#!/usr/bin/env python
# coding: utf-8

# # Classifying Toxic Comments
#
# by: Keith Qu
#
# Natural language classification of toxic comments using logistic regression and Keras (running on TensorFlow). This is a broad run-through, ranging from basic linear methods, to modified linear methods, to deep learning.
#
# Methods include logistic regression, NB-SVM, and CNN and RNN (bidirectional LSTM) models built with Keras on a TensorFlow backend.

# In[3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re, string
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc
from scipy import sparse
from sklearn.model_selection import train_test_split
get_ipython().run_line_magic('matplotlib', 'inline')

# ## First look with logit

# Toxicity categories: toxic, severe_toxic, obscene, threat, insult, identity_hate.
#
# The labels are fairly self-explanatory, but there should also be a lot of overlap between the categories, since anything that qualifies as severe_toxic, obscene, threat, insult or identity_hate should also count as plain toxic.
#
# It makes sense that the categories are not mutually exclusive, so we can treat them as 6 separate binary classification problems.

# In[4]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# In[5]:

train.tail()

# Let's take a look at some of our test comments.

# In[6]:

test.head()

# In[7]:

test.loc[0].comment_text

# It looks like we have some obscenity and insult, combined with a dash of identity hate.
#
# The commenter is also completely wrong, since 50 Cent $>$ Ja Rule any day. Well, maybe not his last album...

# ### Missing data?
#
# Only a little.

# In[8]:

test.fillna(' ',inplace=True)

# ### Vectorizing the comments
#
# We'll do it by words and by characters. Internet comments are a cesspool, and there are character n-grams that carry toxic meaning even inside misspelled or obfuscated words. Maybe we can also combine the two.
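# As a quick toy illustration (hypothetical strings, not part of the pipeline below): a character-level analyzer still finds overlapping n-grams between a word and an obfuscated spelling of it, while word-level features would treat the two as completely unrelated tokens.

# In[ ]:

# Toy example (hypothetical strings): the trigram 'pid' is shared between
# 'stupid' and the obfuscated 'st00pid', so a character-level vectorizer can
# still link the two spellings, whereas a word-level one cannot.
from sklearn.feature_extraction.text import TfidfVectorizer

demo = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
demo.fit(['stupid', 'st00pid'])
print(sorted(demo.vocabulary_))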
# In[9]:

def tok(s): return re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])').sub(r' \1 ',s).split()

# In[10]:

words = TfidfVectorizer(ngram_range=(1,2), lowercase=True, analyzer='word', stop_words='english', tokenizer=tok,
                        min_df=3, max_df=0.9, sublinear_tf=1, smooth_idf=1, dtype=np.float32, strip_accents='unicode')
chars = TfidfVectorizer(ngram_range=(1,5), lowercase=True, analyzer='char', min_df=3, max_df=0.9,
                        sublinear_tf=1, smooth_idf=1, dtype=np.float32)

# In[11]:

train_words = words.fit_transform(train['comment_text'])
train_chars = chars.fit_transform(train['comment_text'])

# ### Only words

# In[12]:

X = sparse.csr_matrix(train_words)
cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
y = train[cols]

# In[13]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10101)

# In[14]:

logit = LogisticRegression(C=4, dual=True)
pred = np.zeros((X_test.shape[0],y_test.shape[1]))
for i,c in enumerate(cols):
    logit.fit(X_train,y_train[c])
    pred[:,i] = logit.predict(X_test)

# In[15]:

for i,c in enumerate(cols):
    print('Confusion matrix for', c)
    print(confusion_matrix(y_test[c],pred[:,i]))

# ### Only characters

# In[16]:

X_c = sparse.csr_matrix(train_chars)
X_train,X_test,y_train,y_test = train_test_split(X_c,y,test_size=0.3,random_state=10101)

# In[17]:

pred = np.zeros((X_test.shape[0],y_test.shape[1]))
for i,c in enumerate(cols):
    logit.fit(X_train,y_train[c])
    pred[:,i] = logit.predict(X_test)

# In[18]:

for i,c in enumerate(cols):
    print('Confusion matrix for', c)
    print(confusion_matrix(y_test[c],pred[:,i]))

# Character-wise vectorization appears to give better results with n-grams of up to length 5, though this could vary with different splits.

# ### Combine words and characters

# Horizontally stacking the word and character matrices creates a larger blended feature set.

# In[19]:

X2 = sparse.hstack([train_words,train_chars])

# In[20]:

X_train,X_test,y_train,y_test = train_test_split(X2,y,test_size=0.3,random_state=10101)

# In[21]:

pred = np.zeros((X_test.shape[0],y_test.shape[1]))
for i,c in enumerate(cols):
    logit.fit(X_train,y_train[c])
    pred[:,i] = logit.predict(X_test)

# In[22]:

for i,c in enumerate(cols):
    print('Confusion matrix for', c)
    print(confusion_matrix(y_test[c],pred[:,i]))

# Putting word and character features together gives better results than either does separately. However, the model does best at identifying toxic, obscene and insult comments, which are the categories most likely to have specific keywords associated with them. There appears to be heavy subjectivity in what exactly constitutes severe toxicity, threats are very context-sensitive, and identity hate is much easier to identify for some groups than for others.
#
# It's also extremely memory-intensive for a machine with a very normal 16 GB of RAM.

# ## NB-SVM
#
# Wang & Manning (2012) find that Multinomial Naive Bayes performs better at classifying short snippets of text, while SVMs are superior on full-length documents. By combining the two models with linear interpolation, they create a new model that is robust for a wide variety of text.
#
# There is a very helpful Python implementation by Jeremy Howard.
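# Roughly, the key ingredient is the naive Bayes log-count ratio used to re-weight the features. With $p = \alpha + \sum_{i:\,y_i=1} x_i$ and $q = \alpha + \sum_{i:\,y_i=0} x_i$ (feature counts summed over the positive and negative comments, smoothed by $\alpha$), Wang & Manning weight each feature by
#
# $$r = \log\frac{p/\lVert p\rVert_1}{q/\lVert q\rVert_1}$$
#
# and fit a linear classifier on the element-wise product $x \cdot r$. The implementation below follows this idea, with a logistic regression standing in for the SVM, a slightly simplified $r$ (normalized by class counts rather than $\lVert p\rVert_1$), and no interpolation step.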
# In[23]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string, re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc
from scipy import sparse
from sklearn.model_selection import train_test_split
get_ipython().run_line_magic('matplotlib', 'inline')

# In[24]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
test.fillna(' ',inplace=True)

# In[25]:

# here's the main part of the implementation by jhoward mentioned above:
# smoothed sums of the feature values over the comments of class y_i,
# divided by the (smoothed) number of comments in that class
def pr(y_i, y):
    p = train_words[y == y_i].sum(0)
    return (p+1)/((y==y_i).sum()+1)

# Split punctuation off into separate tokens. Again, thanks to jhoward for this...
def tok(s): return re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])').sub(r' \1 ',s).split()

# In[26]:

words = TfidfVectorizer(ngram_range=(1,2), lowercase=True, analyzer='word', stop_words='english', tokenizer=tok,
                        min_df=3, max_df=0.9, sublinear_tf=1, smooth_idf=1, use_idf=1, strip_accents='unicode')

# In[27]:

train_words = words.fit_transform(train['comment_text'])
test_words = words.transform(test['comment_text'])

# In[28]:

pred = np.zeros((test.shape[0],len(cols)))
for i,c in enumerate(cols):
    logit = LogisticRegression(C=4, dual=True)
    r = np.log(pr(1,train[c].values)/pr(0,train[c].values))
    X_nb = train_words.multiply(r)
    logit.fit(X_nb,train[c].values)
    pred[:,i] = logit.predict_proba(test_words.multiply(r))[:,1]

# In[29]:

submission = pd.read_csv('sample_submission.csv')
submission[cols] = pred
submission.to_csv('submission.csv',index=False)

# This gives a score in the 0.07s (mean column-wise log loss), which is on the high side. But it was quick and painless: no lemmatization, no feature engineering, no toxic word dictionaries, no spellchecking, and no pre-trained word vectors. So there's a lot of room for improvement.

# In[30]:

submission.head()

# The first entry (0) is from the enlightened Ja Rule supporter shown above. We have detected the obscenity and insult (so it is definitely toxic), but the prediction is a bit weak on the identity hate measure. To be fair, calling someone a "fuckin white boy" ranks low on the identity hate ladder, but arguably it should still count. We don't really know what its true classification is at this point.

# ## Keras/TensorFlow
#
# The Keras API is a convenient way to use TensorFlow, which we'll need in order to build convolutional (CNN) and recurrent (RNN) neural networks.
# In[31]:

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from keras.models import Sequential
from keras.layers import (Dense, Dropout, Input, LSTM, Activation, Flatten, Convolution1D,
                          MaxPooling1D, Bidirectional, GlobalMaxPooling1D, Embedding,
                          BatchNormalization, SpatialDropout1D)
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc

# In[32]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
test.fillna(' ',inplace=True)

# We're going to convert everything to lower case, remove stopwords, lemmatize words (reduce them to their base form), and convert the text into padded integer sequences of length 200 to feed into the learning models.

# In[42]:

def clean_up(t):
    t = t.strip().lower()
    words = t.split()
    # first get rid of the stopwords, or a lemmatized stopword might not
    # be recognized as a stopword
    stops = set(stopwords.words('english'))
    imp_words = ' '.join(w for w in words if w not in stops)
    # lemmatize based on adjectives (J), verbs (V), nouns (N) and adverbs (R) to
    # return only the base words (as opposed to stemming, which can return
    # non-words). e.g. ponies -> poni with stemming, but pony with lemmatizing
    final_words = ''
    lemma = WordNetLemmatizer()
    for (w,tag) in pos_tag(word_tokenize(imp_words)):
        if tag.startswith('J'):
            final_words += ' ' + lemma.lemmatize(w, pos='a')
        elif tag.startswith('V'):
            final_words += ' ' + lemma.lemmatize(w, pos='v')
        elif tag.startswith('N'):
            final_words += ' ' + lemma.lemmatize(w, pos='n')
        elif tag.startswith('R'):
            final_words += ' ' + lemma.lemmatize(w, pos='r')
        else:
            final_words += ' ' + w
    return final_words

# what a great name. do_stuff
def do_stuff(df):
    text = df['comment_text'].copy()
    # First get rid of anything that's not a letter. This may not be the greatest idea, since
    # on3 c4n 3451ly substitute numbers in for letters, but keep it like this for now.
    text.replace(to_replace={r'[^\x00-\x7F]':' '},inplace=True,regex=True)
    text.replace(to_replace={r'[^a-zA-Z]': ' '},inplace=True,regex=True)
    # Then lower case, tokenize and lemmatize
    text = text.apply(lambda t: clean_up(t))
    return text

def tok_seq(train,test):
    # index the 100,000 most frequent words in the training text passed in
    tok = Tokenizer(num_words=100000)
    tok.fit_on_texts(train)
    # convert comments to integer sequences and pad/truncate them to 200 tokens
    seq_train = tok.texts_to_sequences(train)
    seq_test = tok.texts_to_sequences(test)
    data_train = pad_sequences(seq_train,maxlen=200)
    data_test = pad_sequences(seq_test,maxlen=200)
    return data_train,data_test

# Convolution model with 25% dropouts to help with generalization.
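# As a rough intuition (a toy sketch, not one of the models actually trained below): Convolution1D with kernel_size=3 slides 250 learned filters over windows of 3 consecutive token embeddings, so each filter acts like a learned fuzzy 3-gram detector, and GlobalMaxPooling1D keeps only each filter's strongest activation anywhere in the comment.

# In[ ]:

# Toy shape trace (untrained, illustrative only) of the embedding ->
# convolution -> global max pooling stack used in the model below.
from keras.models import Sequential
from keras.layers import Embedding, Convolution1D, GlobalMaxPooling1D

shapes = Sequential()
shapes.add(Embedding(100000, 50, input_length=200))        # (None, 200, 50)
shapes.add(Convolution1D(250, kernel_size=3, padding='valid',
                         activation='relu'))               # (None, 198, 250)
shapes.add(GlobalMaxPooling1D())                           # (None, 250)
shapes.summary()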
# In[48]:

def seq_model(X_train, y_train, test, val='no'):
    model = Sequential()
    model.add(Embedding(100000,50,input_length=200))
    model.add(Dropout(0.25))
    model.add(Convolution1D(250, activation='relu', padding='valid', kernel_size=3))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(250))
    model.add(Dropout(0.25))
    model.add(Activation('relu'))
    # A sigmoid keeps each of the 6 outputs bounded in (0,1)
    model.add(Dense(6,activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # batch_size=1000 seems to be the limit of my 2 GB GTX 960M
    # as with all predictive modeling, there is an under/overfitting
    # tradeoff between too few epochs and too many
    if val == 'no':
        model.fit(X_train,y_train,batch_size=1000,epochs=5)
    else:
        model.fit(X_train,y_train,batch_size=1000,epochs=5,validation_split=0.1)
    pred = model.predict(test)
    return pred

# Bidirectional LSTM model with similar dropouts.

# In[51]:

def bidirect_model(X_train, y_train, test, val='no'):
    model = Sequential()
    model.add(Embedding(100000,100,input_length=200))
    model.add(Bidirectional(LSTM(50, return_sequences=True)))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.25))
    model.add(Dense(250))
    model.add(Dropout(0.25))
    model.add(Activation('relu'))
    model.add(Dense(6,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    if val == 'no':
        model.fit(X_train,y_train,batch_size=1000,epochs=4)
    else:
        model.fit(X_train,y_train,batch_size=1000,epochs=4,validation_split=0.1)
    pred = model.predict(test)
    return pred

# ### Train/Validation
#
# First let's take a look at results with a train/validation split.

# In[36]:

X_train = do_stuff(train)

# In[37]:

cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
y_train = train[cols].values

# In[38]:

Xt, Xv, yt, yv = train_test_split(X_train, y_train, test_size=0.2, random_state=11)

# In[43]:

Xt, Xv = tok_seq(Xt,Xv)

# In[49]:

pred_seq = seq_model(Xt, yt, Xv)

# In[52]:

pred_bid = bidirect_model(Xt,yt,Xv)

# Unsurprisingly, the bidirectional LSTM adds a hefty amount of computing time.
#
# Using a GPU for computation is kind of like reading Playboy for the articles: it might seem questionable at first, but GPUs really are good for deep learning, and every now and then Playboy has a great article about the military-industrial complex.

# In[53]:

roc_auc_score(yv, pred_seq)

# In[54]:

roc_auc_score(yv, pred_bid)

# In[55]:

for i,c in enumerate(cols):
    print('Correlation between results of', c)
    print(np.corrcoef(pred_bid[:,i],pred_seq[:,i])[0,1])

# The two sets of predictions are fairly highly correlated on our validation split, but we can still try mean ensembling the results.
# In[ ]:

roc_auc_score(yv,0.5*(pred_seq+pred_bid))

# ### Testing on the public test set

# In[56]:

X_test = do_stuff(test)

# In[57]:

X_train,X_test = tok_seq(X_train,X_test)

# In[58]:

y_train = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

# In[59]:

finalpred_seq = seq_model(X_train, y_train, X_test, val='yes')

# In[60]:

finalpred_bid = bidirect_model(X_train, y_train, X_test, val='yes')

# In[61]:

submission1 = pd.read_csv('sample_submission.csv')
submission1[cols] = finalpred_seq
submission1.to_csv('submission1.csv',index=False)

# In[62]:

submission2 = pd.read_csv('sample_submission.csv')
submission2[cols] = finalpred_bid
submission2.to_csv('submission2.csv',index=False)

# In[63]:

submission3 = pd.read_csv('sample_submission.csv')
submission3[cols] = 0.5*(submission1[cols] + submission2[cols])
submission3.to_csv('submission3.csv',index=False)

# ### One more...

# In[74]:

def onemore_model(X_train, y_train, test, val='no'):
    model = Sequential()
    model.add(Embedding(100000,100,input_length=200))
    model.add(SpatialDropout1D(0.25))
    model.add(GlobalMaxPooling1D())
    model.add(BatchNormalization())
    model.add(Dense(128))
    model.add(Dropout(0.5))
    model.add(Dense(6,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    if val == 'no':
        model.fit(X_train,y_train,batch_size=1000,epochs=5)
    else:
        model.fit(X_train,y_train,batch_size=1000,epochs=5,validation_split=0.1)
    pred = model.predict(test)
    return pred

# In[75]:

finalpred_om = onemore_model(X_train, y_train, X_test, val='yes')

# In[76]:

submission4 = pd.read_csv('sample_submission.csv')
submission4[cols] = finalpred_om
submission4.to_csv('submission4.csv',index=False)

# This last one scored 0.067. However, dividing the prediction matrix by 1.12 improves the score to 0.063, a consequence of the unbalanced nature of the dataset that has led to some discussion about switching to an AUC scoring system. Regardless, there is still a lot of room for improvement, but I think getting within striking distance of the top 25% isn't too shabby considering I have about 4 days' worth of natural language processing experience.

# ## Conclusion
#
# The obvious next step would be to use existing pre-trained word embeddings such as GloVe or Facebook's fastText. Spell checking and toxic word dictionaries may also be helpful. Possible engineered features include the use of all caps and the prevalence of symbols, exclamation marks and question marks within the body of the text; these can be computed before tokenization, lemmatization and forced lowercasing. Comment length can also be useful; anecdotally, there sometimes seems to be a slight correlation between the length of a comment and how angry its writer is. Early stopping can also be incorporated to reduce overfitting.
#
# These will almost certainly be necessary for significant score improvements.
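# As a rough illustration of the surface-feature idea above (a minimal sketch using the already-loaded train DataFrame, not something used in the submissions here): a few of these features can be computed straight from the raw text before any cleaning.

# In[ ]:

# Minimal sketch (not part of the submissions above): simple surface features
# computed from the raw comments before lowercasing and lemmatization.
raw = train['comment_text'].fillna(' ')
surface = pd.DataFrame({
    'length': raw.str.len(),
    'caps_ratio': raw.str.count(r'[A-Z]') / raw.str.len().clip(lower=1),
    'exclamations': raw.str.count('!'),
    'questions': raw.str.count(r'\?'),
})
surface.describe()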