%load_ext watermark
%watermark -a 'Sebastian Raschka' -v

import pandas as pd

df_train = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')
df_test = pd.read_csv('../../dataset/validation/valid_lyrics_200.csv')

df_train.tail()

X_train = df_train['lyrics']
y_train = df_train['mood']
X_test = df_test['lyrics']
y_test = df_test['mood']

# Label encoder
import pickle
import numpy as np

pickle_in = open('./label_encoder.p', 'rb')
le = pickle.load(pickle_in)
pickle_in.close()

print('before: %s ...' % y_train[:5])
y_train = le.transform(y_train)
y_test = le.transform(y_test)
print('after: %s ...' % y_train[:5])

stop_words = pickle.load(open('./stopwords.p', 'rb'))
semantic_words = pickle.load(open('./whitelist_dicts/semantic_words_py34.p', 'rb'))

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

porter = PorterStemmer()
snowball = EnglishStemmer()

# Raw words
tokenizer = lambda text: text.split()

# Words after Porter stemming
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]

# Words after Snowball stemming
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]

# Only words that are in a list of 'positive' or 'negative' words ('whitelist')
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]

# Porter-stemmed words in whitelist
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]

# Snowball-stemmed words in whitelist
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]
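# Quick sanity check (a minimal sketch; the example string below is arbitrary,
# not taken from the dataset): what each tokenizer hands to the vectorizers
# defined next. Output depends on the stemmer and the loaded whitelist.
example_text = 'there is a light that never goes out'
print(tokenizer(example_text))            # raw whitespace tokens
print(tokenizer_porter(example_text))     # Porter stems of those tokens
print(tokenizer_whitelist(example_text))  # only tokens found in the whitelist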
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.sklearn import DenseTransformer

# Strip non-letter characters and lowercase before tokenizing
preprocessor = lambda text: re.sub('[^a-zA-Z]', ' ', text.lower())

tokenizers = [tokenizer, tokenizer_porter, tokenizer_snowball,
              tokenizer_whitelist, tokenizer_porter_wl, tokenizer_snowball_wl]

# vect_1-6: raw term counts; vect_7-12: tf-idf weights
# (one vectorizer per tokenizer, in the order listed above)
count_vects = [CountVectorizer(binary=False,
                               stop_words=stop_words,
                               ngram_range=(1, 1),
                               preprocessor=preprocessor,
                               tokenizer=tok) for tok in tokenizers]
tfidf_vects = [TfidfVectorizer(binary=False,
                               stop_words=stop_words,
                               ngram_range=(1, 1),
                               preprocessor=preprocessor,
                               tokenizer=tok) for tok in tokenizers]
vectorizers = count_vects + tfidf_vects

pipelines = []
for v in vectorizers:
    pipelines.append(Pipeline([('vect', v),
                               ('dense', DenseTransformer()),
                               ('clf', RandomForestClassifier(n_estimators=100))]))

# done before max_features was set
print('Vocabulary sizes\n')

labels = ['CountVec', 'CountVec porter', 'CountVec snowball',
          'CountVec wl', 'CountVec porter+wl', 'CountVec snowball+wl',
          'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball',
          'TfidfVec wl', 'TfidfVec porter+wl', 'TfidfVec snowball+wl']

for label, v in zip(labels, vectorizers):
    v.fit(X_train)
    print('%s: %s' % (label, len(v.vocabulary_)))

from sklearn import metrics
from sklearn import cross_validation

d = {'Data': labels, 'ROC AUC (%)': []}

for i, clf in enumerate(pipelines):
    scores = cross_validation.cross_val_score(estimator=clf,
                                              X=X_train,
                                              y=y_train,
                                              scoring='roc_auc',
                                              cv=10)
    print('clf %s, %s: %s' % (i + 1, labels[i], scores.mean() * 100))
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean() * 100,
                                                   scores.std() * 100))

df_perform = pd.DataFrame(d)
df_perform = df_perform['ROC AUC (%)']
df_perform.index = labels
df_perform

df_perform.to_csv('./random_forests_data/rand_forest_featextr_1.csv', index=False)

%matplotlib inline
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

classifier = Pipeline([('vect', TfidfVectorizer(binary=False,
                                                stop_words=stop_words,
                                                ngram_range=(1, 1),
                                                preprocessor=preprocessor,
                                                max_features=5000,
                                                tokenizer=tokenizer_porter)),
                       ('dense', DenseTransformer()),
                       ('clf', RandomForestClassifier(n_estimators=100))])

cv = KFold(y_train.shape[0], n_folds=10, random_state=123)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i + 1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.tight_layout()
plt.legend(loc="lower right")  # add the legend before saving so it appears in the file
plt.savefig('./random_forests_images/roc_tfidf_porter_1.eps', dpi=300)
plt.show()

vect = TfidfVectorizer(binary=False,
                       stop_words=stop_words,
                       ngram_range=(1, 1),
                       preprocessor=preprocessor,
                       tokenizer=tokenizer_porter)

pipe_1 = Pipeline([('vect', vect), ('dense', DenseTransformer()),
                   ('clf', RandomForestClassifier(n_estimators=50))])
pipe_2 = Pipeline([('vect', vect), ('dense', DenseTransformer()),
                   ('clf', RandomForestClassifier(n_estimators=100))])
pipe_3 = Pipeline([('vect', vect), ('dense', DenseTransformer()),
                   ('clf', RandomForestClassifier(n_estimators=200))])
pipe_4 = Pipeline([('vect', vect), ('dense', DenseTransformer()),
                   ('clf', RandomForestClassifier(n_estimators=400))])

labels = [50, 100, 200, 400]

for i, clf in enumerate([pipe_1, pipe_2, pipe_3, pipe_4]):
    scores = cross_validation.cross_val_score(estimator=clf,
                                              X=X_train,
                                              y=y_train,
                                              scoring='roc_auc',
                                              cv=10)
    print('clf %s, %s: %0.2f (+/- %0.2f)' % (i + 1, labels[i],
                                             scores.mean() * 100,
                                             scores.std() * 100))

X_train_feat = vect.fit_transform(X_train, y_train)
X_train_feat = X_train_feat.toarray()

from sklearn.grid_search import GridSearchCV

clf_2 = RandomForestClassifier(n_estimators=50)

tuned_parameters = [
    {'criterion': ['gini', 'entropy'],
     'max_features': ['auto', 'log2', 'sqrt'],
     'min_samples_split': [2, 3],
     'min_samples_leaf': [1, 2]},
]

grid_search_1 = GridSearchCV(clf_2,
                             tuned_parameters,
                             n_jobs=1,
                             scoring='roc_auc',
                             cv=10)

grid_search_1.fit(X_train_feat, y_train)  # fit on the features and the class labels

print("Best parameters set found on development set:")
print()
print(grid_search_1.best_estimator_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in grid_search_1.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

# Custom scorer methods to account for positive-negative class labels:
# `pos_label=0` marks the positive class, since we have sad=1, happy=0
acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, greater_is_better=True, pos_label=0)
rec_scorer = metrics.make_scorer(metrics.recall_score, greater_is_better=True, pos_label=0)
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)
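# Toy illustration of the pos_label choice above (made-up labels, not the
# real data): with sad=1 and happy=0, the default pos_label=1 would score
# 'sad' as the positive class instead of 'happy'.
y_true_demo = np.array([0, 0, 1, 1])
y_pred_demo = np.array([0, 1, 1, 1])
print(metrics.precision_score(y_true_demo, y_pred_demo))               # 0.67 ('sad' as positive)
print(metrics.precision_score(y_true_demo, y_pred_demo, pos_label=0))  # 1.00 ('happy' as positive)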
labels = ['Train CountVec', 'Train CountVec porter', 'Train CountVec snowball',
          'Train CountVec wl', 'Train CountVec porter+wl', 'Train CountVec snowball+wl',
          'Train TfidfVec', 'Train TfidfVec porter', 'Train TfidfVec snowball',
          'Train TfidfVec wl', 'Train TfidfVec porter+wl', 'Train TfidfVec snowball+wl',
          'Test CountVec', 'Test CountVec porter', 'Test CountVec snowball',
          'Test CountVec wl', 'Test CountVec porter+wl', 'Test CountVec snowball+wl',
          'Test TfidfVec', 'Test TfidfVec porter', 'Test TfidfVec snowball',
          'Test TfidfVec wl', 'Test TfidfVec porter+wl', 'Test TfidfVec snowball+wl']

d = {'Data': labels,
     'ACC (%)': [],
     'PRE (%)': [],
     'REC (%)': [],
     'F1 (%)': [],
     'ROC AUC (%)': []}

for clf in pipelines:
    clf.fit(X_train, y_train)

# Training-set performance
for clf in pipelines:
    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_train, y_true=y_train))

# Test-set performance
for clf in pipelines:
    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_test, y_true=y_test))

pd.set_option('precision', 2)

df_perform = pd.DataFrame(d)
df_perform = df_perform[['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']]
df_perform.index = labels
df_perform = df_perform * 100
df_perform = np.round(df_perform, decimals=2)
df_perform

df_perform.to_csv('./random_forests_data/clf_performance.csv',
                  index_label=False,
                  float_format='%2.2f')
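# Optional follow-up (assumes df_perform from above is still in memory):
# report the best test-set configuration by ROC AUC.
test_rows = df_perform.loc[[l for l in labels if l.startswith('Test')]]
print('Best test ROC AUC: %s (%.2f%%)' % (test_rows['ROC AUC (%)'].idxmax(),
                                          test_rows['ROC AUC (%)'].max()))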