#!/usr/bin/env python
# coding: utf-8

# In[20]:

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re

from jupyterthemes import jtplot

# Notebook-wide plot styling and inline figure rendering.
jtplot.style('grade3', context='poster', fscale=1.5, ticks=True, grid=False)
jtplot.figsize(x=12., y=7., aspect=1.5)
get_ipython().run_line_magic('matplotlib', 'inline')


# In[ ]:


# # Load and preprocess the dataset
# ---

# In[2]:

# A bare `ls` only works as an interactive IPython automagic; in an exported
# script it is an undefined name. Invoke the magic explicitly, the same way
# the matplotlib magic is invoked above.
get_ipython().run_line_magic('ls', '')


# In[3]:

# One CSV of comments per video.
csvs = ['Youtube01-Psy.csv',
        'Youtube02-KatyPerry.csv',
        'Youtube03-LMFAO.csv',
        'Youtube04-Eminem.csv',
        'Youtube05-Shakira.csv']
dfs = [pd.read_csv(csv_path) for csv_path in csvs]

# read_csv numbers the rows of every file from 0, so a plain concat would
# leave duplicate row labels; ignore_index gives each comment a unique label.
data = pd.concat(dfs, ignore_index=True)


# In[4]:

data.head()
#

# ** we need only the content and the class ** #

# In[60]:

# Keep only the comment text and its label. The explicit copy makes `data`
# an independent frame, so the column assignment further down in this block
# does not write into a slice of the original frame (SettingWithCopyWarning).
data = data[['CONTENT', 'CLASS']].copy()


# In[61]:

data.head()


# In[125]:

# Replace the numeric labels with readable class names (0 -> HAM, 1 -> SPAM).
data['CLASS'] = data['CLASS'].map({0: 'HAM', 1: 'SPAM'})
#
#
# ** Preprocessing the data ** #

# Let's remove all punctuation, special characters, and digits, and keep only the words. #

# In[ ]:


# In[126]:

# Demo of the word pattern: every maximal run of ASCII letters is one match,
# so digits and spaces act as separators.
re.compile("[A-Za-z]+").findall("hey 123how are you")
#

# In[127]:

def preprocess(text):
    """Lowercase *text* and return its runs of ASCII letters as a list."""
    lowered = text.lower()
    return re.findall('[A-Za-z]+', lowered)


# In[128]:

def _join_words(comment):
    """Rebuild a comment from its cleaned words, separated by single spaces."""
    return ' '.join(preprocess(comment))


# Store the cleaned text next to the raw comment.
data['PREPROCESSED'] = data['CONTENT'].apply(_join_words)


# In[129]:

data.head()


# In[ ]:

#

# ** visualize words for spam and ham ** #

# In[130]:

# Concatenate every preprocessed comment of a class into one space-separated
# string. str.join runs in linear time, returns '' for an empty selection,
# and does not depend on `reduce`, which is a builtin only on Python 2.
spam_words = " ".join(data[data.CLASS == 'SPAM'].PREPROCESSED)
ham_words = " ".join(data[data.CLASS == 'HAM'].PREPROCESSED)


# In[ ]:


# In[131]:

from collections import Counter


# In[132]:

# Word -> number of occurrences, per class.
spam_word_freq = Counter(spam_words.split())
ham_word_freq = Counter(ham_words.split())


# In[133]:

# most_common() with no argument lists every word, most frequent first.
sw_df = pd.DataFrame(spam_word_freq.most_common(), columns=['word', 'freq'])
sw_df.head()


# In[134]:

hw_df = pd.DataFrame(ham_word_freq.most_common(), columns=['word', 'freq'])
hw_df.head()


# In[135]:

# Bar chart of the 50 most frequent spam words.
fig, ax = plt.subplots(figsize=(12, 6))
sw_df[:50].plot(x='word', y='freq', kind='bar', ax=ax)


# In[136]:

# Bar chart of the 50 most frequent ham words.
fig, ax = plt.subplots(figsize=(12, 6))
hw_df[:50].plot(x='word', y='freq', kind='bar', ax=ax)


# In[ ]:

#

# Spam/Ham WordCloud #

# In[137]:

from wordcloud import WordCloud,STOPWORDS
from PIL import Image


# In[ ]:


# In[138]:

# Read the mask inside a context manager so the image file is closed as soon
# as its pixels have been copied into the array.
with Image.open("alice_mask.png") as mask_image:
    alice_mask = np.array(mask_image)


# In[139]:

stopwords = set(STOPWORDS)


# In[140]:

# One cloud per class, both drawn with the same mask and stop-word set.
wc_spam = WordCloud(background_color="white", max_words=2000,
                    mask=alice_mask, stopwords=stopwords)
wc_ham = WordCloud(background_color="white", max_words=2000,
                   mask=alice_mask, stopwords=stopwords)


# In[141]:

wc_spam.generate(spam_words)
wc_ham.generate(ham_words)


# In[142]:

# show the spam cloud
plt.figure(figsize=(15, 12))
plt.imshow(wc_spam, interpolation='bilinear')
plt.axis("off")
plt.show()


# In[143]:

# show the ham cloud
plt.figure(figsize=(15, 12))
plt.imshow(wc_ham, interpolation='bilinear')
plt.axis("off")
plt.show()
#

# ** Bag of Words Model ** #

# In[162]:

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,cross_val_score


# In[ ]:


# In[163]:

# Hold out 10% of the comments as a test set; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.PREPROCESSED, data.CLASS, test_size=0.1, random_state=2017)


# In[164]:

# Single-argument print(...) calls emit the same text on Python 2 and 3,
# whereas the `print x, y` statement form is a SyntaxError on Python 3.
print("{} {}".format(x_train.shape, y_test.shape))


# In[165]:

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

clf.fit(x_train, y_train)

# 10-fold cross-validation on the training split, using the estimator's
# default score.
cvs = cross_val_score(clf, x_train, y_train, cv=10, verbose=0, n_jobs=4)
print("Accuracy: {} +-{} ".format(round(cvs.mean(), 2), round(cvs.std(), 2)))


# In[ ]:


# In[ ]:

#

# # ** plot the confusion matrix ** # #

# In[169]:

import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Accent):
    """Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are drawn as true labels and columns as
        predicted labels.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool
        When True, every row is divided by its sum before printing and
        plotting.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples stays all-zero instead of turning into NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Draw only after the optional normalization, so the cell colours and
    # the colour bar agree with the numbers written into the cells.
    plt.figure(figsize=(12, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Counts are shown as integers, ratios with two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells above half of the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# In[171]:

from sklearn.metrics import confusion_matrix,classification_report

# Evaluate the fitted pipeline on the held-out test split.
y_true = y_test
y_pred = clf.predict(x_test)
# Single-argument print(...) calls emit the same text on Python 2 and 3.
print("classification report: ")
print(classification_report(y_true, y_pred, target_names=clf.classes_))
conf_mat = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(conf_mat, clf.classes_, normalize=True)


# In[ ]:

#

# Pretty neat, eh? Without any optimization we are able to achieve 92% accuracy on cross-validation & 90% accuracy on the test set. #

# # ** Can we do better ? ** #

# In[180]:

from sklearn.ensemble import RandomForestClassifier

# Same text features as before, with a 1000-tree random forest as classifier.
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=4,
                                               random_state=2017))])

clf.fit(x_train, y_train)

# 10-fold cross-validation scored with the weighted F1 measure.
cvs = cross_val_score(clf, x_train, y_train, cv=10, verbose=0, n_jobs=4,
                      scoring='f1_weighted')
# The label names the metric that is actually computed (f1_weighted); it
# previously said "Accuracy". Single-argument print(...) calls emit the same
# text on Python 2 and 3.
print("F1 (weighted): {} +-{} ".format(round(cvs.mean(), 2),
                                       round(cvs.std(), 2)))


# In[181]:

from sklearn.metrics import confusion_matrix,classification_report

# Evaluate the fitted forest on the held-out test split.
y_true = y_test
y_pred = clf.predict(x_test)
print("classification report: ")
print(classification_report(y_true, y_pred, target_names=clf.classes_))
conf_mat = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(conf_mat, clf.classes_, normalize=True)
#

# Wow! We achieved a whopping 96% weighted F1 score on cross-validation and 97% accuracy on the test set using a Random Forest of 1000 trees. #

#

# We have not removed any stop-words and still achieved great accuracy. An interesting follow-up would be to remove stop-words and check whether it has any impact on the performance. #

#
# In[ ]: