Load Libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
Reading Dataset¶
We have five separate YouTube comment CSV files that we need to concatenate into a single dataset.
files = glob.glob('youtube-dataset/youtube-dataset\\*.csv') # Collect the paths of all CSV files (Windows-style separator)
files # List of the path names
['youtube-dataset/youtube-dataset\\Youtube01.csv', 'youtube-dataset/youtube-dataset\\Youtube02.csv', 'youtube-dataset/youtube-dataset\\Youtube03.csv', 'youtube-dataset/youtube-dataset\\Youtube04.csv', 'youtube-dataset/youtube-dataset\\Youtube05.csv']
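The backslash in the glob pattern is Windows-specific. A portable way to build the same list (a sketch, assuming the CSVs live under youtube-dataset/youtube-dataset/) is:

from pathlib import Path
files = sorted(str(p) for p in Path('youtube-dataset/youtube-dataset').glob('*.csv'))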
datasets = []
for file in files:  # Pick the file paths one by one
    # Read each CSV and drop the columns we don't need
    datasets.append(pd.read_csv(file).drop(['COMMENT_ID', 'AUTHOR', 'DATE'], axis=1))
datasets[0].head() # Index 0 is the first CSV file
|   | CONTENT | CLASS |
|---|---------|-------|
| 0 | Huh, anyway check out this you[tube] channel: ... | 1 |
| 1 | Hey guys check out my new channel and our firs... | 1 |
| 2 | just for test I have to say murdev.com | 1 |
| 3 | watch?v=vtaRGgvGtWQ Check this out . | 1 |
| 4 | Hey, check out my new website!! This site is a... | 1 |
type(datasets), len(datasets)
(list, 5)
# Let's combine all five DataFrames into one
# First argument: the list of pandas objects to concatenate
# axis=0: stack them vertically, one on top of another
# ignore_index=True: drop the original indices and renumber from 0 to n-1
df = pd.concat(datasets, axis = 0, ignore_index=True)
df.head()
|   | CONTENT | CLASS |
|---|---------|-------|
| 0 | Huh, anyway check out this you[tube] channel: ... | 1 |
| 1 | Hey guys check out my new channel and our firs... | 1 |
| 2 | just for test I have to say murdev.com | 1 |
| 3 | watch?v=vtaRGgvGtWQ Check this out . | 1 |
| 4 | Hey, check out my new website!! This site is a... | 1 |
type(df), df.shape
(pandas.core.frame.DataFrame, (1955, 2))
Checking for Nulls¶
df.isna().sum() # There are no nulls
CONTENT    0
CLASS      0
dtype: int64
Checking frequency of target class¶
df['CLASS'].value_counts() # 1 is spam and 0 is ham
CLASS
1    1004
0     951
Name: count, dtype: int64
For Naive Bayes to work well, the dataset should be reasonably balanced. Ours has roughly the same number of records in each class.
Understanding CountVectorizer¶
To process and understand text data in machine learning tasks like spam detection, we often need to convert the raw text into numerical features that models can work with. One of the simplest and most commonly used tools for this is CountVectorizer from the sklearn.feature_extraction.text module.
CountVectorizer converts a collection of text documents into a matrix of token (word) counts. This process is often referred to as bag-of-words representation. It performs the following steps:
- Tokenization: Splits each text (document) into individual words (tokens).
- Vocabulary Building: Collects all unique words across the corpus to create a vocabulary.
- Encoding: For each document, counts the frequency of each word in the vocabulary.
This method does not consider word order, grammar, or context - just the presence and frequency of each word in each document.
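To make the bag-of-words idea concrete, here is a minimal pure-Python sketch of the same three steps. It uses plain whitespace splitting on already-lowercased text, whereas CountVectorizer's real tokenizer also handles case and punctuation:

texts = ['she is learning something new', 'he is learning to drive']
vocab = sorted({word for text in texts for word in text.split()})          # vocabulary building
counts = [[text.split().count(word) for word in vocab] for text in texts]  # encoding
# vocab  -> ['drive', 'he', 'is', 'learning', 'new', 'she', 'something', 'to']
# counts -> [[0, 0, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 0, 0, 0, 1]]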
from sklearn.feature_extraction.text import CountVectorizer
text_example = ['She is learning something new']
vectorizer_sample = CountVectorizer()
vectorizer_sample.fit(text_example)
vectorizer_sample.transform(text_example).toarray() # .toarray() converts the sparse matrix that transform() returns
array([[1, 1, 1, 1, 1]], dtype=int64)
vectorizer_sample.get_feature_names_out()
array(['is', 'learning', 'new', 'she', 'something'], dtype=object)
This shows that each word in the vocabulary appears exactly once in the sentence. Let's try another sentence.
text_example2 = ['He is learning to drive']
vectorizer_sample.transform(text_example2).toarray()
array([[1, 1, 0, 0, 0]], dtype=int64)
vectorizer_sample.get_feature_names_out()
array(['is', 'learning', 'new', 'she', 'something'], dtype=object)
The vector has a 1 for the words the vectorizer saw during fitting - is and learning - while the words it has never seen (he, to, drive) are simply ignored, and the remaining vocabulary positions (new, she, something) get 0.
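To see which column corresponds to which word, we can inspect the fitted vocabulary; the vocabulary_ attribute maps each token to its column index:

vectorizer_sample.vocabulary_
# {'she': 3, 'is': 0, 'learning': 1, 'something': 4, 'new': 2} (key order may vary; the values are the column indices)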
# Now let's use fit_transform with two sentences
text_example = ['She is learning something new', 'He is learning to drive']
vectorizer_sample = CountVectorizer()
vectorizer_sample.fit_transform(text_example).toarray()
array([[0, 0, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1]], dtype=int64)
vectorizer_sample.get_feature_names_out()
array(['drive', 'he', 'is', 'learning', 'new', 'she', 'something', 'to'], dtype=object)
This shows the vocabulary list has now expanded. Each row in the matrix corresponds to a sentence (or document), and each column corresponds to a word from the vocabulary. The numbers in the matrix indicate how many times each word appeared in each sentence.
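A convenient way to read this matrix is to label its columns with the vocabulary:

bow = vectorizer_sample.transform(text_example).toarray()
pd.DataFrame(bow, columns=vectorizer_sample.get_feature_names_out())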
Splitting the dataset¶
Let's first split the dataset into training and test sets.
from sklearn.model_selection import train_test_split
X = df['CONTENT']
y = df['CLASS']
x_train, x_test, y_train, y_test = train_test_split(X, y,
test_size=0.1,
random_state=42,
stratify=y)
# stratify=y keeps the class proportions (spam/ham) the same in the training and test sets
x_train.shape, x_test.shape
((1759,), (196,))
y_train.value_counts(normalize=True) # Almost equally spread
CLASS
1    0.51336
0    0.48664
Name: proportion, dtype: float64
y_test.value_counts(normalize=True) # Almost equally spread
CLASS
1    0.515306
0    0.484694
Name: proportion, dtype: float64
Tokenizing the comments from our dataset using CountVectorizer¶
# Now getting started with vectorizer
vectorizer = CountVectorizer()
x_train_transformed = vectorizer.fit_transform(x_train)
x_test_transformed = vectorizer.transform(x_test)
x_train_transformed.toarray()
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
x_train_transformed.shape # 1759 rows and 4210 columns; each column is one word in the learned vocabulary
(1759, 4210)
x_test_transformed.shape
(196, 4210)
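Almost every entry in this matrix is zero, which is why scikit-learn keeps it in compressed sparse format. A quick density check (using the sparse matrix's nnz attribute):

density = x_train_transformed.nnz / (x_train_transformed.shape[0] * x_train_transformed.shape[1])
print(f'{density:.4%} of the entries are non-zero')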
Applying the Dataset to the Model¶
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB works well for word-count features like ours, and the classes are roughly balanced
clf = MultinomialNB()
clf.fit(x_train_transformed, y_train)
MultinomialNB()
clf.get_params()
{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}
- alpha (1.0): the smoothing parameter (Laplace smoothing). It prevents zero probabilities for words never seen with a given class during training; a value of 1.0 is standard Laplace smoothing.
- class_prior (None): no manual prior probabilities are provided; the priors will be learned from the training data.
- fit_prior (True): the model estimates the class prior probabilities from the training data rather than assuming them to be uniform.
- force_alpha (True): the given alpha is used exactly as specified; with force_alpha=False, values of alpha below 1e-10 would be clipped to 1e-10 for numerical stability.
These parameters control how the Multinomial Naive Bayes model learns and applies probabilities.
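To see what alpha does concretely, we can recompute the smoothed per-word log-probabilities for the spam class by hand and compare them with what the model learned. This is a sketch that assumes the fitted objects from above and MultinomialNB's standard estimate log((N_wi + alpha) / (N_i + alpha * n_features)):

alpha = clf.alpha
spam_rows = x_train_transformed[(y_train == 1).to_numpy()]  # training comments labelled spam
word_counts = np.asarray(spam_rows.sum(axis=0)).ravel()     # per-word counts within the spam class
manual_log_prob = np.log((word_counts + alpha) / (word_counts.sum() + alpha * len(word_counts)))
np.allclose(manual_log_prob, clf.feature_log_prob_[1])      # should be True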
np.exp(clf.class_log_prior_)
array([0.48664014, 0.51335986])
Here we are taking the log prior probabilities of each class from a trained Naive Bayes classifier (clf.class_log_prior_), and applying the exponential function to convert them back from log space to regular probability space.
This tells us the model has learned the following class prior probabilities:
- Probability of class 0 (e.g., "ham") = 48.66%
- Probability of class 1 (e.g., "spam") = 51.34%
These priors are computed from the training data, assuming fit_prior=True, and they reflect the class distribution observed in the training set.
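We can confirm this directly against the training-set class frequencies:

y_train.value_counts(normalize=True).sort_index()  # ~ [0.48664, 0.51336], matching np.exp(clf.class_log_prior_)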
predictions = clf.predict(x_test_transformed)
Evaluating the model¶
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
print(confusion_matrix(predictions, y_test))
# Note: we pass (predictions, y_test), the reverse of scikit-learn's (y_true, y_pred)
# convention, so rows correspond to the predicted class and columns to the true class
[[82  6]
 [13 95]]
ConfusionMatrixDisplay.from_predictions(
predictions, y_test,
labels = clf.classes_,
cmap = 'magma'
);
- 82: True Negatives (TN) - predicted class 0, actual class 0
- 6: False Negatives (FN) - predicted class 0, actual class 1 (also called Type 2 errors)
- 13: False Positives (FP) - predicted class 1, actual class 0 (also called Type 1 errors)
- 95: True Positives (TP) - predicted class 1, actual class 1

Thus, TP is 95 (comments correctly predicted as class 1) and TN is 82 (comments correctly predicted as class 0). FP is 13, where the model predicted class 1 but the actual class is 0, and FN is 6, where the model predicted class 0 but the actual class is 1. In total, the model made 177 correct predictions (82 + 95) and 19 incorrect ones (13 + 6).
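From these four numbers we can derive the headline metrics by hand (a small sanity check, treating spam as the positive class):

tn, fn, fp, tp = 82, 6, 13, 95              # read off the matrix above
accuracy = (tp + tn) / (tp + tn + fp + fn)  # 177 / 196 ≈ 0.903
precision_spam = tp / (tp + fp)             # 95 / 108 ≈ 0.880
recall_spam = tp / (tp + fn)                # 95 / 101 ≈ 0.941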
Let's look at some other evaluation metrics. (Note that the (predictions, y_test) argument order carries over here: relative to a conventional (y_true, y_pred) call, the precision and recall columns below are interchanged, and support counts predicted rather than true labels.)
from sklearn.metrics import classification_report
print(classification_report(predictions, y_test,
target_names=['Ham', 'Spam']))
              precision    recall  f1-score   support

         Ham       0.86      0.93      0.90        88
        Spam       0.94      0.88      0.91       108

    accuracy                           0.90       196
   macro avg       0.90      0.91      0.90       196
weighted avg       0.91      0.90      0.90       196
Creating probability distribution figure¶
spam_proba = clf.predict_proba(x_test_transformed).round(3)[:, 1]
df_scatter = pd.DataFrame()
df_scatter['True class'] = y_test
df_scatter['Predicted class'] = predictions
df_scatter['Predicted probability (spam)'] = spam_proba
df_scatter = df_scatter.reset_index(drop = True)
palette_0 = sns.color_palette(['#000000'])
palette_1 = sns.color_palette(['#FF0000'])
df_scatter_0 = df_scatter[df_scatter['True class'] == 0].reset_index(drop = True)
df_scatter_1 = df_scatter[df_scatter['True class'] == 1].reset_index(drop = True)
sns.set()
fig, (ax1,ax2) = plt.subplots(2,1, figsize=(12,5))
fig.tight_layout(pad = 3)
sns.scatterplot(x = 'Predicted probability (spam)',
y = np.zeros(df_scatter_0.shape[0]),
data = df_scatter_0,
hue = 'True class',
s = 50,
markers = ['o'],
palette = palette_0,
style = 'True class',
legend = False,
ax = ax1).set(yticklabels=[])
ax1.set_title('Probability distribution of comments belonging to the true \'ham\' class')
ax1.vlines(0.5, -1, 1, linestyles = 'dashed', colors = 'red');
sns.scatterplot(x = 'Predicted probability (spam)',
y = np.zeros(df_scatter_1.shape[0]),
hue = 'True class',
data = df_scatter_1,
s = 50,
palette = palette_1,
markers = ['X'],
style = 'True class',
legend = False,
ax = ax2).set(yticklabels=[])
ax2.set_title('Probability distribution of comments belonging to the true \'spam\' class')
ax2.vlines(0.5, -1, 1, linestyles = 'dashed', colors = 'red');
Optimizing Model¶
Here we will use:
clf = MultinomialNB(class_prior=np.array([0.6, 0.4]))
This line creates a Multinomial Naive Bayes classifier with manually specified class prior probabilities:
- It sets the prior probability for class 0 to 0.6 (60%)
- It sets the prior probability for class 1 to 0.4 (40%)
The key difference is that the class prior is explicitly defined by the user instead of being inferred from the training data, which is what happens by default. If fit_prior=True and class_prior=None (the default behaviour), the classifier automatically calculates the priors from the frequency of each class in the training data. By setting class_prior=np.array([0.6, 0.4]), we override the learned class distribution and force the model to assume that class 0 is more likely a priori, regardless of the actual distribution in the training data.
How this can change results:
- Bias Introduction: If the actual class distribution in the data is different (say, 50-50), then assigning a 60-40 prior introduces bias toward class 0.
- Prediction Shifts: During prediction, Bayes' theorem multiplies the likelihood by the prior. A higher prior for class 0 can cause more test samples to be classified as class 0, even if their features slightly favor class 1.
- Useful in Imbalanced or Domain-Specific Contexts: This can be intentional and beneficial if you already know from external knowledge that one class is inherently more common or important, even if your training data doesn’t reflect that.
This line adds manual control over class bias in the model, which can significantly alter classification outcomes.
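A tiny numeric illustration of the prediction-shift point above, with hypothetical likelihood values chosen only to show the effect:

likelihoods = np.array([0.010, 0.012])                # [P(x|ham), P(x|spam)]: features slightly favour spam
print(np.argmax(likelihoods * np.array([0.5, 0.5])))  # 1: with a flat prior the comment is called spam
print(np.argmax(likelihoods * np.array([0.6, 0.4])))  # 0: the 60-40 prior flips the call to ham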
# Let's try using different hyperparameters
clf = MultinomialNB(class_prior= np.array([0.6, 0.4]))
clf.fit(x_train_transformed, y_train)
prediction_new = clf.predict(x_test_transformed)
ConfusionMatrixDisplay.from_predictions(
prediction_new, y_test,
labels = clf.classes_,
cmap = 'magma'
);
The confusion matrix shows that more comments are correctly classified than with the previous model.
print(classification_report(prediction_new, y_test,
target_names=['Ham', 'Spam']))
              precision    recall  f1-score   support

         Ham       0.93      0.94      0.93        94
        Spam       0.94      0.93      0.94       102

    accuracy                           0.93       196
   macro avg       0.93      0.93      0.93       196
weighted avg       0.93      0.93      0.93       196
Creating probability distribution figure¶
spam_proba = clf.predict_proba(x_test_transformed).round(3)[:, 1]
df_scatter = pd.DataFrame()
df_scatter['True class'] = y_test
df_scatter['Predicted class'] = prediction_new
df_scatter['Predicted probability (spam)'] = spam_proba
df_scatter = df_scatter.reset_index(drop = True)
palette_0 = sns.color_palette(['#000000'])
palette_1 = sns.color_palette(['#FF0000'])
df_scatter_0 = df_scatter[df_scatter['True class'] == 0].reset_index(drop = True)
df_scatter_1 = df_scatter[df_scatter['True class'] == 1].reset_index(drop = True)
sns.set()
fig, (ax1,ax2) = plt.subplots(2,1, figsize=(12,5))
fig.tight_layout(pad = 3)
sns.scatterplot(x = 'Predicted probability (spam)',
y = np.zeros(df_scatter_0.shape[0]),
data = df_scatter_0,
hue = 'True class',
s = 50,
markers = ['o'],
palette = palette_0,
style = 'True class',
legend = False,
ax = ax1).set(yticklabels=[])
ax1.set_title('Probability distribution of comments belonging to the true \'ham\' class')
ax1.vlines(0.5, -1, 1, linestyles = 'dashed', colors = 'red');
sns.scatterplot(x = 'Predicted probability (spam)',
y = np.zeros(df_scatter_1.shape[0]),
hue = 'True class',
data = df_scatter_1,
s = 50,
palette = palette_1,
markers = ['X'],
style = 'True class',
legend = False,
ax = ax2).set(yticklabels=[])
ax2.set_title('Probability distribution of comments belonging to the true \'spam\' class')
ax2.vlines(0.5, -1, 1, linestyles = 'dashed', colors = 'red');
Prediction on Unseen Data¶
predict_data = vectorizer.transform(['This song is amazing!',
'You can win 1m dollars right now, just click here!!!'])
clf.predict(predict_data)
array([0, 1], dtype=int64)
The comments are correctly classified!
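To see how confident the model is about each call, we can also inspect the predicted probabilities:

clf.predict_proba(predict_data).round(3)  # one [P(ham), P(spam)] row per comment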