Text-Classification

Sun 29 June 2025

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from io import BytesIO
import requests

# Location of the Stack Overflow posts dataset (a CSV with `post` and `tags` columns).
filename = 'https://gitlab.com/rajacsp/datasets/raw/master/stack-overflow-data.csv'

# Download the file.  The timeout keeps a stalled connection from hanging the
# notebook indefinitely, and raise_for_status() stops an HTTP error page from
# being handed to the CSV parser as if it were data.
r = requests.get(filename, timeout=30)
r.raise_for_status()
data = r.content

# Parse the downloaded bytes straight from memory.
df = pd.read_csv(BytesIO(data))

df.head()

	post	tags
0	what is causing this behavior in our c# datet...	c#
1	have dynamic html load as if it was in an ifra...	asp.net
2	how to convert a float value in to min:sec i ...	objective-c
3	.net framework 4 redistributable just wonderi...	.net
4	trying to calculate and print the mean and its...	python

# Drop rows whose tag is missing; only labelled posts are kept.

df = df.loc[df['tags'].notna()]

df.describe()

	post	tags
count	40000	40000
unique	40000	20
top	what is causing this behavior in our c# datet...	c#
freq	1	2000

# (rows, columns) of the frame after the null-tag filter
df.shape

(40000, 2)

# Count words -- start by previewing the first ten rows

df.head(10)

	post	tags
0	what is causing this behavior in our c# datet...	c#
1	have dynamic html load as if it was in an ifra...	asp.net
2	how to convert a float value in to min:sec i ...	objective-c
3	.net framework 4 redistributable just wonderi...	.net
4	trying to calculate and print the mean and its...	python
5	how to give alias name for my website i have ...	asp.net
6	window.open() returns null in angularjs it wo...	angularjs
7	identifying server timeout quickly in iphone ...	iphone
8	unknown method key error in rails 2.3.8 unit ...	ruby-on-rails
9	from the include how to show and hide the con...	angularjs

# Display the raw `post` column (the text that is counted and cleaned below)
df['post']

0        what is causing this behavior  in our c# datet...
1        have dynamic html load as if it was in an ifra...
2        how to convert a float value in to min:sec  i ...
3        .net framework 4 redistributable  just wonderi...
4        trying to calculate and print the mean and its...
                               ...                        
39995    different output if at end of function rather ...
39996    multiple arrays  is there a way to access/stor...
39997    c - how to differentiate a second same key pre...
39998    state.go not working (#! & url is being append...
39999    understanding the mechanisms of intentservice ...
Name: post, Length: 40000, dtype: object

# Total number of space-separated tokens across all posts, before cleaning.
# Splitting on a single ' ' also counts the empty strings that runs of
# consecutive spaces produce.
df['post'].map(lambda post: len(post.split(' '))).sum()

10286120

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Separator punctuation: clean_text replaces each match with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, lowercase letter, space, '#', '+' or '_':
# clean_text deletes each match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Return a normalized copy of a raw post.

    Steps, in order: strip HTML markup, lowercase, turn separator
    punctuation into spaces, delete every remaining disallowed character,
    and drop English stopwords.

    text: a string (may contain HTML)
    return: the remaining words joined by single spaces
    """
    # Parse as HTML and keep only the text content.
    plain = BeautifulSoup(text, "lxml").text
    # Lowercase first: BAD_SYMBOLS_RE only whitelists lowercase letters.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain.lower())
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)

# Replace every post with its cleaned form.
df['post'] = df['post'].map(clean_text)

# Recount the tokens now that the text has been cleaned

df['post'].map(lambda post: len(post.split(' '))).sum()

# Features are the post texts; labels are the tags.
X, y = df['post'], df['tags']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Using NB

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes, all with default settings.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Pipeline

?Documentation for Pipeline (Fitted)

Parameters

	steps	[('vect', ...), ('tfidf', ...), ...]
	transform_input	None
	memory	None
	verbose	False

CountVectorizer

?Documentation for CountVectorizer

Parameters

	input	'content'
	encoding	'utf-8'
	decode_error	'strict'
	strip_accents	None
	lowercase	True
	preprocessor	None
	tokenizer	None
	stop_words	None
	token_pattern	'(?u)\\b\\w\\w+\\b'
	ngram_range	(1, ...)
	analyzer	'word'
	max_df	1.0
	min_df	1
	max_features	None
	vocabulary	None
	binary	False
	dtype	<class 'numpy.int64'>

TfidfTransformer

?Documentation for TfidfTransformer

Parameters

	norm	'l2'
	use_idf	True
	smooth_idf	True
	sublinear_tf	False

MultinomialNB

?Documentation for MultinomialNB

Parameters

	alpha	1.0
	force_alpha	True
	fit_prior	True
	class_prior	None

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Build (but do not fit) the same three-stage pipeline under the name `pipeline`.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Show the pipeline's text representation.
print(pipeline)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

# Train the pipeline that was just built.  This line previously re-fitted `nb`,
# which had already been trained on the same data, leaving `pipeline` unfitted.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

Pipeline

?Documentation for Pipeline (Fitted)

Parameters

	steps	[('vect', ...), ('tfidf', ...), ...]
	transform_input	None
	memory	None
	verbose	False

CountVectorizer

?Documentation for CountVectorizer

Parameters

	input	'content'
	encoding	'utf-8'
	decode_error	'strict'
	strip_accents	None
	lowercase	True
	preprocessor	None
	tokenizer	None
	stop_words	None
	token_pattern	'(?u)\\b\\w\\w+\\b'
	ngram_range	(1, ...)
	analyzer	'word'
	max_df	1.0
	min_df	1
	max_features	None
	vocabulary	None
	binary	False
	dtype	<class 'numpy.int64'>

TfidfTransformer

?Documentation for TfidfTransformer

Parameters

	norm	'l2'
	use_idf	True
	smooth_idf	True
	sublinear_tf	False

MultinomialNB

?Documentation for MultinomialNB

Parameters

	alpha	1.0
	force_alpha	True
	fit_prior	True
	class_prior	None

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# The same three stages again, this time with the main settings written out
# explicitly instead of being left implicit.
pipeline = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            input='content',
            decode_error='strict',
            lowercase=True,
            analyzer='word',
            ngram_range=(1, 1),
            max_df=1.0,
            min_df=1,
            binary=False,
        ),
    ),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB(alpha=1.0)),
])

# Train on the training split.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

Pipeline

?Documentation for Pipeline (Fitted)

Parameters

	steps	[('vect', ...), ('tfidf', ...), ...]
	transform_input	None
	memory	None
	verbose	False

CountVectorizer

?Documentation for CountVectorizer

Parameters

	input	'content'
	encoding	'utf-8'
	decode_error	'strict'
	strip_accents	None
	lowercase	True
	preprocessor	None
	tokenizer	None
	stop_words	None
	token_pattern	'(?u)\\b\\w\\w+\\b'
	ngram_range	(1, ...)
	analyzer	'word'
	max_df	1.0
	min_df	1
	max_features	None
	vocabulary	None
	binary	False
	dtype	<class 'numpy.int64'>

TfidfTransformer

?Documentation for TfidfTransformer

Parameters

	norm	'l2'
	use_idf	True
	smooth_idf	True
	sublinear_tf	False

MultinomialNB

?Documentation for MultinomialNB

Parameters

	alpha	1.0
	force_alpha	True
	fit_prior	True
	class_prior	None

from sklearn.metrics import accuracy_score
# Predict a tag for every held-out post, then print the accuracy score.
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7394166666666667

# classification_report is not imported anywhere else in this file.
from sklearn.metrics import classification_report

# Tags to report on, in the row order wanted for the report.
my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']

# Pass the tags as `labels=` so each row is computed for the tag it is named
# after.  Passing this list only as `target_names=` attaches the names, in
# list order, to the sorted label set, which mislabels the rows.
print(classification_report(y_test, y_pred, labels=my_tags))

               precision    recall  f1-score   support

         java       0.63      0.65      0.64       613
         html       0.94      0.86      0.90       620
      asp.net       0.87      0.92      0.90       587
           c#       0.70      0.77      0.73       586
ruby-on-rails       0.73      0.87      0.79       599
       jquery       0.72      0.51      0.60       589
        mysql       0.77      0.74      0.75       594
          php       0.69      0.89      0.78       610
          ios       0.63      0.59      0.61       617
   javascript       0.57      0.65      0.60       587
       python       0.70      0.50      0.59       611
            c       0.79      0.79      0.79       594
          css       0.84      0.59      0.69       619
      android       0.65      0.84      0.74       574
       iphone       0.64      0.83      0.72       584
          sql       0.66      0.64      0.65       578
  objective-c       0.79      0.77      0.78       591
          c++       0.89      0.83      0.86       608
    angularjs       0.94      0.89      0.91       638
         .net       0.74      0.66      0.70       601

     accuracy                           0.74     12000
    macro avg       0.74      0.74      0.74     12000
 weighted avg       0.75      0.74      0.74     12000

Score: 25

Category: basics