%load_ext watermark
%watermark -a 'Sebastian Raschka' -v
Sebastian Raschka CPython 3.4.2 IPython 2.3.0
import pandas as pd
# Read the training and validation lyric tables from their CSV files.
train_csv = '../../dataset/training/train_lyrics_1000.csv'
valid_csv = '../../dataset/validation/valid_lyrics_200.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, valid_csv))
# Display the last rows of the training frame as a quick sanity check.
df_train.tail()
file | artist | title | lyrics | mood | year | |
---|---|---|---|---|---|---|
995 | TRBIGRY128F42597B3.h5 | Sade | All About Our Love | Its all about our love\nSo shall it be forever... | sad | 2000 |
996 | TRBIIEU128F9307C88.h5 | New Found Glory | Don't Let Her Pull You Down | It's time that I rain on your parade\nWatch as... | happy | 2009 |
997 | TRBIIJY12903CE4755.h5 | Mindy McCready | Ten Thousand Angels | Speakin of the devil\nLook who just walked in\... | happy | 1996 |
998 | TRBIIOT128F423C594.h5 | Joy Division | Leaders Of Men | Born from some mother's womb\nJust like any ot... | sad | 1978 |
999 | TRBIJYB128F14AE326.h5 | Seventh Day Slumber | Shattered Life | This wanting more from me is tearing me, it's ... | sad | 2005 |
# The 'lyrics' column is the model input; the 'mood' column is the target.
X_train, y_train = df_train['lyrics'], df_train['mood']
X_test, y_test = df_test['lyrics'], df_test['mood']
# Label encoder
import pickle
import numpy as np

# Restore the pickled label encoder from disk. The context manager closes the
# file even when unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./label_encoder.p', 'rb') as pickle_in:
    le = pickle.load(pickle_in)

# Wrap the operand in a 1-tuple so %-formatting always treats it as a single
# value, whatever container type it happens to be.
print('before: %s ...' % (y_train[:5],))

# Replace the mood strings with the encoder's integer class labels.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

print('after: %s ...' % (y_train[:5],))
before: 0 sad 1 happy 2 sad 3 happy 4 sad Name: mood, dtype: object ... after: [1 0 1 0 1] ...
import pickle
# Load the pickled stop-word collection and the sentiment-word whitelist.
# `with` guarantees both file handles are closed once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./stopwords.p', 'rb') as stopwords_file:
    stop_words = pickle.load(stopwords_file)
with open('./whitelist_dicts/semantic_words_py34.p', 'rb') as whitelist_file:
    semantic_words = pickle.load(whitelist_file)
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
porter = PorterStemmer()
snowball = EnglishStemmer()


# raw words
def tokenizer(text):
    """Return the whitespace-separated words of *text* unchanged."""
    return text.split()


# words after Porter stemming
def tokenizer_porter(text):
    """Return the Porter stem of every whitespace-separated word in *text*."""
    return [porter.stem(word) for word in text.split()]


# Words after Snowball stemming
def tokenizer_snowball(text):
    """Return the Snowball stem of every whitespace-separated word in *text*."""
    return [snowball.stem(word) for word in text.split()]


# Only words that are in a list of 'positive' or 'negative' words ('whitelist')
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
def tokenizer_whitelist(text):
    """Return only the words of *text* that appear in ``semantic_words``."""
    return [word for word in text.split() if word in semantic_words]


# Porter-stemmed words in whitelist
def tokenizer_porter_wl(text):
    """Return Porter stems of the whitelisted words in *text*.

    The whitelist test is applied to the unstemmed word; stemming happens
    only for words that pass it.
    """
    return [porter.stem(word) for word in text.split() if word in semantic_words]


# Snowball-stemmed words in whitelist
def tokenizer_snowball_wl(text):
    """Return Snowball stems of the whitelisted words in *text*.

    The whitelist test is applied to the unstemmed word; stemming happens
    only for words that pass it.
    """
    return [snowball.stem(word) for word in text.split() if word in semantic_words]
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.sklearn import DenseTransformer
def _letters_only(text):
    """Lower-case *text* and replace every character outside a-z with a space."""
    return re.sub('[^a-zA-Z]', ' ', text.lower())


def _build_vectorizer(vectorizer_cls, tokenize):
    """Instantiate *vectorizer_cls* with the settings shared by all variants.

    Only the vectorizer class (raw counts vs. tf-idf) and the tokenizer differ
    between the twelve feature-extraction variants; everything else is fixed:
    non-binary unigram features, the shared stop-word list and the
    letters-only preprocessor.
    """
    return vectorizer_cls(binary=False,
                          stop_words=stop_words,
                          ngram_range=(1, 1),
                          preprocessor=_letters_only,
                          tokenizer=tokenize)


# Tokenizer order defines the numbering: raw, Porter, Snowball, whitelist,
# Porter + whitelist, Snowball + whitelist.
_tokenizer_variants = (tokenizer,
                       tokenizer_porter,
                       tokenizer_snowball,
                       tokenizer_whitelist,
                       tokenizer_porter_wl,
                       tokenizer_snowball_wl)

# vect_1 .. vect_6: raw term counts.
vect_1, vect_2, vect_3, vect_4, vect_5, vect_6 = [
    _build_vectorizer(CountVectorizer, tokenize) for tokenize in _tokenizer_variants
]

# vect_7 .. vect_12: tf-idf weights, same tokenizer order.
vect_7, vect_8, vect_9, vect_10, vect_11, vect_12 = [
    _build_vectorizer(TfidfVectorizer, tokenize) for tokenize in _tokenizer_variants
]
# Build one pipeline per vectorizer: vectorize, pass through DenseTransformer,
# then classify with a 100-tree random forest.
vectorizers = [vect_1, vect_2, vect_3, vect_4, vect_5, vect_6, vect_7, vect_8, vect_9, vect_10, vect_11, vect_12]
pipelines = [
    Pipeline([('vect', vectorizer),
              ('dense', DenseTransformer()),
              ('clf', RandomForestClassifier(n_estimators=100))])
    for vectorizer in vectorizers
]
# done before max_features was set
print('Vocabulary sizes\n')

# Labels follow the vectorizer order: six tokenizer variants for CountVec,
# then the same six for TfidfVec.
variants = ['', ' porter', ' snowball', ' wl', ' porter+wl', ' snowball+wl']
labels = [family + variant
          for family in ('CountVec', 'TfidfVec')
          for variant in variants]

# Fit each vectorizer on the training lyrics and report its vocabulary size.
for position, name in enumerate(labels):
    fitted = vectorizers[position].fit(X_train)
    print('%s: %s' % (name, len(fitted.vocabulary_)))
Vocabulary sizes CountVec: 11378 CountVec porter: 8551 CountVec snowball: 8528 CountVec wl: 1666 CountVec porter+wl: 1349 CountVec snowball+wl: 1332 TfidfVec: 11378 TfidfVec porter: 8551 TfidfVec snowball: 8528 TfidfVec wl: 1666 TfidfVec porter+wl: 1349 TfidfVec snowball+wl: 1332
from sklearn import metrics
from sklearn import cross_validation
# One label per pipeline, in the same order as `pipelines`.
variants = ['', ' porter', ' snowball', ' wl', ' porter+wl', ' snowball+wl']
labels = [family + variant
          for family in ('CountVec', 'TfidfVec')
          for variant in variants]

d = {'Data': labels,
     'ROC AUC (%)': []}

# 10-fold cross-validated ROC AUC on the training set for every pipeline.
for position, (name, pipeline) in enumerate(zip(labels, pipelines), start=1):
    scores = cross_validation.cross_val_score(estimator=pipeline,
                                              X=X_train,
                                              y=y_train,
                                              scoring='roc_auc',
                                              cv=10)
    mean_pct = scores.mean()*100
    std_pct = scores.std()*100
    print('clf %s, %s: %s' % (position, name, mean_pct))
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (mean_pct, std_pct))
clf 1, CountVec: 72.4267184835 clf 2, CountVec porter: 72.5908238226 clf 3, CountVec snowball: 74.2652990948 clf 4, CountVec wl: 68.4214056146 clf 5, CountVec porter+wl: 68.7880362062 clf 6, CountVec snowball+wl: 69.3732388823 clf 7, TfidfVec: 70.9020529975 clf 8, TfidfVec porter: 73.5260330579 clf 9, TfidfVec snowball: 72.5986685032 clf 10, TfidfVec wl: 66.7347697757 clf 11, TfidfVec porter+wl: 69.1317821068 clf 12, TfidfVec snowball+wl: 68.3577298964
# Keep only the formatted ROC AUC column and index it by configuration label.
df_perform = pd.DataFrame(d)['ROC AUC (%)']
df_perform.index = labels
df_perform
CountVec 72.43 (+/- 4.18) CountVec porter 72.59 (+/- 6.05) CountVec snowball 74.27 (+/- 5.00) CountVec wl 68.42 (+/- 4.92) CountVec porter+wl 68.79 (+/- 4.48) CountVec snowball+wl 69.37 (+/- 4.69) TfidfVec 70.90 (+/- 4.03) TfidfVec porter 73.53 (+/- 4.21) TfidfVec snowball 72.60 (+/- 4.61) TfidfVec wl 66.73 (+/- 5.91) TfidfVec porter+wl 69.13 (+/- 5.08) TfidfVec snowball+wl 68.36 (+/- 6.03) Name: ROC AUC (%), dtype: object
# Persist the formatted ROC AUC strings. index=False leaves the configuration
# labels out of the file, so rows are identified by position only.
df_perform.to_csv('./random_forests_data/rand_forest_featextr_1.csv', index=False)
%matplotlib inline
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from scipy import interp
sns.set()
sns.set_style("whitegrid")

# Tf-idf over Porter-stemmed tokens (vocabulary capped at 5000 features),
# passed through DenseTransformer and classified by a 100-tree random forest.
classifier = Pipeline([('vect', TfidfVectorizer(binary=False,
                                                stop_words=stop_words,
                                                ngram_range=(1,1),
                                                preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                                                max_features = 5000,
                                                tokenizer=lambda text: [porter.stem(word) for word in text.split()]
                                                )),
                       ('dense', DenseTransformer()),
                       ('clf', RandomForestClassifier(n_estimators=100))])

# NOTE(review): random_state presumably has no effect here because shuffling
# is not enabled -- confirm against the KFold documentation.
cv = KFold(y_train.shape[0], n_folds=10, random_state=123)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    # Fit on the fold's training part, score its held-out part.
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    # Resample this fold's curve onto the common FPR grid so that the folds
    # can be averaged point by point.
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

# Average the accumulated curves and pin the end point to (1, 1).
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The legend has to exist before the layout is computed and the figure is
# written, otherwise the saved file does not contain it.
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig('./random_forests_images/roc_tfidf_porter_1.eps', dpi=300)
plt.show()
# Tf-idf vectorizer over Porter-stemmed, letters-only, lower-cased tokens.
vect = TfidfVectorizer(
    tokenizer=lambda doc: [porter.stem(token) for token in doc.split()],
    preprocessor=lambda doc: re.sub('[^a-zA-Z]', ' ', doc.lower()),
    ngram_range=(1, 1),
    stop_words=stop_words,
    binary=False,
)
from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation
# Forest sizes to compare; each gets its own pipeline around the shared `vect`.
labels = [50, 100, 200, 400]

pipe_1, pipe_2, pipe_3, pipe_4 = [
    Pipeline([('vect', vect),
              ('dense', DenseTransformer()),
              ('clf', RandomForestClassifier(n_estimators=n_trees))])
    for n_trees in labels
]

# 10-fold cross-validated ROC AUC on the training set for every forest size.
sized_pipelines = zip(labels, (pipe_1, pipe_2, pipe_3, pipe_4))
for position, (n_trees, pipeline) in enumerate(sized_pipelines, start=1):
    scores = cross_validation.cross_val_score(estimator=pipeline,
                                              X=X_train,
                                              y=y_train,
                                              scoring='roc_auc',
                                              cv=10)
    print('clf %s, %s: %0.2f (+/- %0.2f)' % (position, n_trees, scores.mean()*100, scores.std()*100))
clf 1, 50: 71.31 (+/- 6.53) clf 2, 100: 71.62 (+/- 4.76) clf 3, 200: 72.51 (+/- 3.81) clf 4, 400: 74.77 (+/- 3.80)
# Vectorize the training lyrics once and convert the result to a dense array.
X_train_feat = vect.fit_transform(X_train, y_train).toarray()
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
# Exhaustive search over random-forest hyper-parameters on the pre-computed
# dense tf-idf features, scored by 10-fold cross-validated ROC AUC.
clf_2 = RandomForestClassifier(n_estimators=50)

tuned_parameters = [
    {'criterion': ['gini', 'entropy'],
     'max_features': ['auto', 'log2', 'sqrt'],
     'min_samples_split':[2,3],
     'min_samples_leaf':[1,2]},
]

grid_search_1 = GridSearchCV(clf_2,
                             tuned_parameters,
                             n_jobs=1,
                             scoring='roc_auc',
                             cv=10
                             )

# The second argument is the target: the encoded mood labels, not the
# feature matrix.
grid_search_1.fit(X_train_feat, y_train)

print("Best parameters set found on development set:")
print()
print(grid_search_1.best_estimator_)
print()
print("Grid scores on development set:")
print()
# One line per parameter combination: mean CV score, half the standard
# deviation across folds, and the parameters themselves.
for params, mean_score, scores in grid_search_1.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() / 2, params))
# Custom scorer methods to account for positive-negative class labels
from sklearn import metrics

# Labels are encoded as sad=1, happy=0; `pos_label=0` makes "happy" the
# positive class for precision, recall and F1.
pre_scorer, rec_scorer, f1_scorer = [
    metrics.make_scorer(score_func, greater_is_better=True, pos_label=0)
    for score_func in (metrics.precision_score,
                       metrics.recall_score,
                       metrics.f1_score)
]

# Accuracy and ROC AUC take no `pos_label`.
acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
# NOTE(review): built without needs_threshold / needs_proba, so this scorer
# presumably passes hard `predict` labels to roc_auc_score -- confirm that
# this is intended.
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)
# Row labels: every feature-extraction variant, first for the training set
# and then for the test set, in the same order as `pipelines`.
variants = ['', ' porter', ' snowball', ' wl', ' porter+wl', ' snowball+wl']
labels = [split + ' ' + family + variant
          for split in ('Train', 'Test')
          for family in ('CountVec', 'TfidfVec')
          for variant in variants]

# Result table skeleton: one empty list per metric, filled row by row later.
d = {'Data': labels}
for metric_name in ('ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)'):
    d[metric_name] = []
# Train every pipeline on the full training set first.
for model in pipelines:
    model.fit(X_train, y_train)

# Column name -> scorer, in the order the metrics are recorded.
metric_scorers = (('ACC (%)', acc_scorer),
                  ('PRE (%)', pre_scorer),
                  ('REC (%)', rec_scorer),
                  ('F1 (%)', f1_scorer),
                  ('ROC AUC (%)', auc_scorer))

# Score all pipelines on the training data, then all of them on the test data,
# so each column ends up with the "Train ..." rows followed by the "Test ..." rows.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    for model in pipelines:
        for column, scorer in metric_scorers:
            d[column].append(scorer(estimator=model, X=features, y_true=targets))
pd.set_option('precision', 2)

# Keep the metric columns only, label the rows, and express the scores as
# percentages rounded to two decimals.
metric_columns = ['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']
df_perform = pd.DataFrame(d)[metric_columns]
df_perform.index = labels
df_perform = np.round(df_perform * 100, decimals=2)
df_perform
ACC (%) | PRE (%) | REC (%) | F1 (%) | ROC AUC (%) | |
---|---|---|---|---|---|
Train CountVec | 99.8 | 99.8 | 99.8 | 99.8 | 99.8 |
Train CountVec porter | 99.8 | 99.8 | 99.8 | 99.8 | 99.8 |
Train CountVec snowball | 99.8 | 99.8 | 99.8 | 99.8 | 99.8 |
Train CountVec wl | 99.5 | 100.0 | 98.9 | 99.4 | 99.4 |
Train CountVec porter+wl | 99.5 | 99.8 | 99.1 | 99.4 | 99.5 |
Train CountVec snowball+wl | 99.5 | 99.8 | 99.1 | 99.4 | 99.5 |
Train TfidfVec | 99.8 | 99.5 | 100.0 | 99.8 | 99.8 |
Train TfidfVec porter | 99.8 | 100.0 | 99.5 | 99.8 | 99.8 |
Train TfidfVec snowball | 99.8 | 99.8 | 99.8 | 99.8 | 99.8 |
Train TfidfVec wl | 99.4 | 99.5 | 99.1 | 99.3 | 99.4 |
Train TfidfVec porter+wl | 99.4 | 99.5 | 99.1 | 99.3 | 99.4 |
Train TfidfVec snowball+wl | 99.4 | 100.0 | 98.7 | 99.3 | 99.3 |
Test CountVec | 61.5 | 70.6 | 45.7 | 55.5 | 62.3 |
Test CountVec porter | 61.5 | 72.6 | 42.9 | 53.9 | 62.5 |
Test CountVec snowball | 68.0 | 80.6 | 51.4 | 62.8 | 68.9 |
Test CountVec wl | 61.0 | 70.2 | 44.8 | 54.6 | 61.9 |
Test CountVec porter+wl | 62.5 | 70.3 | 49.5 | 58.1 | 63.2 |
Test CountVec snowball+wl | 62.0 | 69.3 | 49.5 | 57.8 | 62.7 |
Test TfidfVec | 60.5 | 71.0 | 41.9 | 52.7 | 61.5 |
Test TfidfVec porter | 63.5 | 77.6 | 42.9 | 55.2 | 64.6 |
Test TfidfVec snowball | 61.5 | 74.1 | 41.0 | 52.8 | 62.6 |
Test TfidfVec wl | 56.5 | 62.5 | 42.9 | 50.9 | 57.2 |
Test TfidfVec porter+wl | 61.0 | 68.5 | 47.6 | 56.2 | 61.7 |
Test TfidfVec snowball+wl | 63.5 | 69.0 | 55.2 | 61.4 | 63.9 |
# Write the train/test performance table to CSV with floats formatted to two
# decimals. index_label=False is passed so that no header field is written
# for the index column (see the pandas to_csv documentation).
df_perform.to_csv('./random_forests_data/clf_performance.csv', index_label=False, float_format='%2.2f')