import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
from jupyterthemes import jtplot
jtplot.style('grade3', context='poster', fscale=1.5, ticks=True, grid=False)
jtplot.figsize(x=12., y=7., aspect = 1.5)
%matplotlib inline
ls
255fk.jpg Youtube01-Psy.csv Youtube04-Eminem.csv alice_mask.png Youtube02-KatyPerry.csv Youtube05-Shakira.csv __MACOSX/ Youtube03-LMFAO.csv youtube-comments-spam-detection.ipynb
csvs = ['Youtube01-Psy.csv','Youtube02-KatyPerry.csv','Youtube03-LMFAO.csv','Youtube04-Eminem.csv','Youtube05-Shakira.csv']
dfs = []
for csv in csvs:
    df = pd.read_csv(csv)
    dfs.append(df)
data = pd.concat(dfs, ignore_index=True)  # reset the index so rows from different files don't share index labels
data.head()
| | COMMENT_ID | AUTHOR | DATE | CONTENT | CLASS |
|---|---|---|---|---|---|
| 0 | LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU | Julius NM | 2013-11-07T06:20:48 | Huh, anyway check out this you[tube] channel: ... | 1 |
| 1 | LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A | adam riyati | 2013-11-07T12:37:15 | Hey guys check out my new channel and our firs... | 1 |
| 2 | LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8 | Evgeny Murashkin | 2013-11-08T17:34:21 | just for test I have to say murdev.com | 1 |
| 3 | z13jhp0bxqncu512g22wvzkasxmvvzjaz04 | ElNino Melendez | 2013-11-09T08:28:43 | me shaking my sexy ass on my channel enjoy ^_^ | 1 |
| 4 | z13fwbwp1oujthgqj04chlngpvzmtt3r3dw | GsMega | 2013-11-10T16:05:38 | watch?v=vtaRGgvGtWQ Check this out . | 1 |
**We only need the CONTENT and CLASS columns**
data = data[['CONTENT','CLASS']]
data.head()
| | CONTENT | CLASS |
|---|---|---|
| 0 | Huh, anyway check out this you[tube] channel: ... | 1 |
| 1 | Hey guys check out my new channel and our firs... | 1 |
| 2 | just for test I have to say murdev.com | 1 |
| 3 | me shaking my sexy ass on my channel enjoy ^_^ | 1 |
| 4 | watch?v=vtaRGgvGtWQ Check this out . | 1 |
data.CLASS = data.CLASS.map({0:'HAM',1:'SPAM'})
**Preprocessing the data**
Let's remove all punctuation, special characters and numbers, and keep only the words.
re.findall("[A-Za-z]+","hey 123how are you")
['hey', 'how', 'are', 'you']
def preprocess(text):
    # keep only alphabetic tokens, lower-cased
    text_cleaned = re.findall('[A-Za-z]+', text.lower())
    return text_cleaned
data['PREPROCESSED'] = data.CONTENT.apply(lambda text: ' '.join(preprocess(text)))
data.head()
| | CONTENT | CLASS | PREPROCESSED |
|---|---|---|---|
| 0 | Huh, anyway check out this you[tube] channel: ... | SPAM | huh anyway check out this you tube channel kob... |
| 1 | Hey guys check out my new channel and our firs... | SPAM | hey guys check out my new channel and our firs... |
| 2 | just for test I have to say murdev.com | SPAM | just for test i have to say murdev com |
| 3 | me shaking my sexy ass on my channel enjoy ^_^ | SPAM | me shaking my sexy ass on my channel enjoy |
| 4 | watch?v=vtaRGgvGtWQ Check this out . | SPAM | watch v vtarggvgtwq check this out |
**Visualize words for spam and ham**
from functools import reduce  # reduce is a builtin on Python 2 but must be imported on Python 3
spam_words = reduce(lambda x, y: x + " " + y, data[data.CLASS == 'SPAM'].PREPROCESSED)
ham_words = reduce(lambda x, y: x + " " + y, data[data.CLASS == 'HAM'].PREPROCESSED)
from collections import Counter
spam_word_freq = Counter(spam_words.split())
ham_word_freq = Counter(ham_words.split())
sw_df = pd.DataFrame(spam_word_freq.most_common(), columns=['word', 'freq'])
sw_df.head()
| | word | freq |
|---|---|---|
| 0 | i | 612 |
| 1 | out | 571 |
| 2 | check | 559 |
| 3 | my | 531 |
| 4 | you | 529 |
hw_df = pd.DataFrame(ham_word_freq.most_common(), columns=['word', 'freq'])
hw_df.head()
| | word | freq |
|---|---|---|
| 0 | i | 316 |
| 1 | this | 301 |
| 2 | the | 283 |
| 3 | song | 224 |
| 4 | is | 217 |
fig, ax = plt.subplots(figsize=(12,6))
sw_df[:50].plot(x='word',y='freq',kind='bar',ax = ax)
(Bar chart of the 50 most frequent spam words.)
fig, ax = plt.subplots(figsize=(12,6))
hw_df[:50].plot(x='word',y='freq',kind='bar',ax = ax)
(Bar chart of the 50 most frequent ham words.)
**Spam/Ham WordCloud**
from wordcloud import WordCloud,STOPWORDS
from PIL import Image
alice_mask = np.array(Image.open("alice_mask.png"))
stopwords = set(STOPWORDS)
wc_spam = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
                    stopwords=stopwords)
wc_ham = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
                   stopwords=stopwords)
wc_spam.generate(spam_words)
wc_ham.generate(ham_words)
# show
plt.figure(figsize=(15,12))
plt.imshow(wc_spam, interpolation='bilinear')
plt.axis("off")
plt.show()
# show
plt.figure(figsize=(15,12))
plt.imshow(wc_ham, interpolation='bilinear')
plt.axis("off")
plt.show()
**Bag-of-Words Model**
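Before fitting the real pipeline, here is a quick illustration of what the two text stages do: CountVectorizer turns each comment into a vector of word counts, and TfidfTransformer re-weights those counts so that words appearing in almost every comment carry less weight. A minimal sketch on a made-up toy corpus (the sentences below are illustrative, not from the dataset):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# toy corpus, purely for illustration
toy = ["check out my channel", "i love this song", "check this song out"]

vect = CountVectorizer()
counts = vect.fit_transform(toy)              # sparse matrix: one row per comment, one column per word
print(sorted(vect.vocabulary_))               # vocabulary learned from the corpus
print(counts.toarray())                       # raw word counts

tfidf = TfidfTransformer()
print(tfidf.fit_transform(counts).toarray())  # counts re-weighted by inverse document frequency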
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,cross_val_score
x_train,x_test,y_train,y_test = train_test_split(data.PREPROCESSED,data.CLASS,test_size=0.1,random_state=2017)
print(x_train.shape, y_test.shape)
(1760,) (196,)
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])
clf.fit(x_train,y_train)
cvs = cross_val_score(clf, x_train, y_train, cv=10, verbose=0, n_jobs=4)
print("Accuracy: {} +- {}".format(round(cvs.mean(), 2), round(cvs.std(), 2)))
**Plot the confusion matrix**
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Accent):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        # normalize each row so it sums to 1 (per-class recall on the diagonal)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.figure(figsize=(12, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, round(cm[i, j], 2),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
from sklearn.metrics import confusion_matrix,classification_report
y_true = y_test
y_pred = clf.predict(x_test)
print "classification report: "
print classification_report(y_true,y_pred,target_names=clf.classes_)
conf_mat = confusion_matrix(y_true,y_pred)
plot_confusion_matrix(conf_mat,clf.classes_,normalize=True)
classification report:
             precision    recall  f1-score   support

        HAM       0.92      0.86      0.89        90
       SPAM       0.88      0.93      0.91       106

avg / total       0.90      0.90      0.90       196

Normalized confusion matrix
[[ 0.85555556  0.14444444]
 [ 0.06603774  0.93396226]]
Pretty neat, eh? Without any optimization we are able to achieve 92% accuracy on cross-validation and 90% accuracy on the test set.
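As a sanity check, the 90% test accuracy can be recovered from the confusion-matrix counts (there are 90 HAM and 106 SPAM comments in the test split):

# per-class counts follow from the normalized rows above: 0.856 * 90 = 77 HAM and 0.934 * 106 = 99 SPAM correct
print((77 + 99) / 196.0)  # ≈ 0.898, i.e. ~90% test accuracy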
**Can we do better?**
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=4, random_state=2017))])
clf.fit(x_train,y_train)
cvs = cross_val_score(clf, x_train, y_train, cv=10, verbose=0, n_jobs=4, scoring='f1_weighted')
print("Weighted F1: {} +- {}".format(round(cvs.mean(), 2), round(cvs.std(), 2)))
Weighted F1: 0.96 +- 0.01
from sklearn.metrics import confusion_matrix,classification_report
y_true = y_test
y_pred = clf.predict(x_test)
print "classification report: "
print classification_report(y_true,y_pred,target_names=clf.classes_)
conf_mat = confusion_matrix(y_true,y_pred)
plot_confusion_matrix(conf_mat,clf.classes_,normalize=True)
classification report:
             precision    recall  f1-score   support

        HAM       0.95      1.00      0.97        90
       SPAM       1.00      0.95      0.98       106

avg / total       0.98      0.97      0.97       196

Normalized confusion matrix
[[ 1.          0.        ]
 [ 0.04716981  0.95283019]]
Wow! We achieved a whopping 96% weighted F1 on cross-validation and 97% accuracy on the test set using a random forest of 1,000 trees.
We have not removed any stop-words and still achieved great accuracy. An interesting follow-up would be to remove stop-words and check whether that has any impact on performance; a sketch of that experiment is below.
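A minimal sketch of that follow-up, under the assumption that we reuse the same pipeline: CountVectorizer can drop English stop-words itself via its built-in stop_words='english' list (the variable names here are illustrative):

# same pipeline as above, but with English stop-words removed inside the vectorizer
clf_nostop = Pipeline([('vect', CountVectorizer(stop_words='english')),
                       ('tfidf', TfidfTransformer()),
                       ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=4, random_state=2017))])
cvs_nostop = cross_val_score(clf_nostop, x_train, y_train, cv=10, n_jobs=4, scoring='f1_weighted')
print("Weighted F1 without stop-words: {} +- {}".format(round(cvs_nostop.mean(), 2), round(cvs_nostop.std(), 2)))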