import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests
# Download the Stack Overflow posts dataset and load it into a DataFrame.
filename = 'https://gitlab.com/rajacsp/datasets/raw/master/stack-overflow-data.csv'
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(filename, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
r.raise_for_status()
data = r.content
df = pd.read_csv(BytesIO(data))
|   | post | tags |
|---|------|------|
| 0 | what is causing this behavior in our c# datet... | c# |
| 1 | have dynamic html load as if it was in an ifra... | asp.net |
| 2 | how to convert a float value in to min:sec i ... | objective-c |
| 3 | .net framework 4 redistributable just wonderi... | .net |
| 4 | trying to calculate and print the mean and its... | python |
# Keep only the rows whose 'tags' value is present (drop rows with a null tag)
df = df[df['tags'].notna()]
|        | post | tags |
|--------|------|------|
| count  | 40000 | 40000 |
| unique | 40000 | 20 |
| top    | what is causing this behavior in our c# datet... | c# |
| freq   | 1 | 2000 |
# Preview the first 10 rows
df.head(n=10)
|   | post | tags |
|---|------|------|
| 0 | what is causing this behavior in our c# datet... | c# |
| 1 | have dynamic html load as if it was in an ifra... | asp.net |
| 2 | how to convert a float value in to min:sec i ... | objective-c |
| 3 | .net framework 4 redistributable just wonderi... | .net |
| 4 | trying to calculate and print the mean and its... | python |
| 5 | how to give alias name for my website i have ... | asp.net |
| 6 | window.open() returns null in angularjs it wo... | angularjs |
| 7 | identifying server timeout quickly in iphone ... | iphone |
| 8 | unknown method key error in rails 2.3.8 unit ... | ruby-on-rails |
| 9 | from the include how to show and hide the con... | angularjs |
0 what is causing this behavior in our c# datet...
1 have dynamic html load as if it was in an ifra...
2 how to convert a float value in to min:sec i ...
3 .net framework 4 redistributable just wonderi...
4 trying to calculate and print the mean and its...
...
39995 different output if at end of function rather ...
39996 multiple arrays is there a way to access/stor...
39997 c - how to differentiate a second same key pre...
39998 state.go not working (#! & url is being append...
39999 understanding the mechanisms of intentservice ...
Name: post, Length: 40000, dtype: object
# Total number of space-separated tokens across all posts
df['post'].map(lambda post: len(post.split(' '))).sum()
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# Separator punctuation; clean_text replaces each match with a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase letter, space, '#', '+' or '_';
# clean_text deletes each match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held in a set; clean_text drops these words.
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Normalise one post for bag-of-words modelling.

    The text is parsed with BeautifulSoup to strip markup, lowercased,
    has separator punctuation turned into spaces, has every remaining
    character outside ``0-9a-z #+_`` removed, and finally has English
    stopwords dropped.

    text: a string
    return: the cleaned string, with tokens joined by single spaces
    """
    # HTML decoding
    plain = BeautifulSoup(text, "lxml").text
    lowered = plain.lower()
    # Separator punctuation becomes a space so adjacent words stay apart
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    # Remaining unwanted symbols are removed outright
    filtered = BAD_SYMBOLS_RE.sub('', spaced)
    kept = [token for token in filtered.split() if token not in STOPWORDS]
    return ' '.join(kept)
# Replace every post with its cleaned form
df['post'] = df['post'].map(clean_text)
# Recount the space-separated tokens now that the posts are cleaned
df['post'].map(lambda post: len(post.split(' '))).sum()
# Features are the post texts and targets are the tag labels. Both names must
# be bound before the split, otherwise the call below raises NameError.
X = df['post']
y = df['tags']
# Hold out 30% of the posts for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Using NB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes,
# trained on the training split.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)
Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# Unfitted three-stage text classifier: token counts, TF-IDF weighting,
# then multinomial Naive Bayes.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# The three-stage classifier with the vectoriser, TF-IDF and Naive Bayes
# settings spelled out explicitly, then trained on the training split.
pipeline = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            input='content',
            decode_error='strict',
            lowercase=True,
            analyzer='word',
            ngram_range=(1, 1),
            max_df=1.0,
            min_df=1,
            binary=False,
        ),
    ),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB(alpha=1.0)),
])
pipeline.fit(X_train, y_train)
Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
from sklearn.metrics import accuracy_score
# Predict tags for the held-out posts and report the share predicted correctly
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.7394166666666667
# classification_report is not among the names imported at the top of the file.
from sklearn.metrics import classification_report

my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']
# By default the report orders its rows by the sorted label values, and
# target_names is matched to the rows purely by position. Passing my_tags only
# as target_names therefore attaches the names to the wrong rows. Supplying
# the same list as `labels` fixes both the row order and the row names.
print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))
precision recall f1-score support
java 0.63 0.65 0.64 613
html 0.94 0.86 0.90 620
asp.net 0.87 0.92 0.90 587
c# 0.70 0.77 0.73 586
ruby-on-rails 0.73 0.87 0.79 599
jquery 0.72 0.51 0.60 589
mysql 0.77 0.74 0.75 594
php 0.69 0.89 0.78 610
ios 0.63 0.59 0.61 617
javascript 0.57 0.65 0.60 587
python 0.70 0.50 0.59 611
c 0.79 0.79 0.79 594
css 0.84 0.59 0.69 619
android 0.65 0.84 0.74 574
iphone 0.64 0.83 0.72 584
sql 0.66 0.64 0.65 578
objective-c 0.79 0.77 0.78 591
c++ 0.89 0.83 0.86 608
angularjs 0.94 0.89 0.91 638
.net 0.74 0.66 0.70 601
accuracy 0.74 12000
macro avg 0.74 0.74 0.74 12000
weighted avg 0.75 0.74 0.74 12000
Score: 25