In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v
Sebastian Raschka 

CPython 3.4.2
IPython 2.3.0

Music Mood Classification Using Random Forests

Sections



Reading the Training Dataset

In [13]:
import pandas as pd

# Load the training and validation lyric tables in one pass
# (order matters: training file first, validation file second).
df_train, df_test = [
    pd.read_csv(csv_path)
    for csv_path in ('../../dataset/training/train_lyrics_1000.csv',
                     '../../dataset/validation/valid_lyrics_200.csv')
]

# Show the last rows of the training table as a sanity check.
df_train.tail()
Out[13]:
file artist title lyrics mood year
995 TRBIGRY128F42597B3.h5 Sade All About Our Love Its all about our love\nSo shall it be forever... sad 2000
996 TRBIIEU128F9307C88.h5 New Found Glory Don't Let Her Pull You Down It's time that I rain on your parade\nWatch as... happy 2009
997 TRBIIJY12903CE4755.h5 Mindy McCready Ten Thousand Angels Speakin of the devil\nLook who just walked in\... happy 1996
998 TRBIIOT128F423C594.h5 Joy Division Leaders Of Men Born from some mother's womb\nJust like any ot... sad 1978
999 TRBIJYB128F14AE326.h5 Seventh Day Slumber Shattered Life This wanting more from me is tearing me, it's ... sad 2005
In [14]:
# Features are the raw lyric strings; targets are the mood labels.
X_train, y_train = df_train['lyrics'], df_train['mood']
X_test, y_test = df_test['lyrics'], df_test['mood']
In [15]:
# Label encoder

import pickle
import numpy as np

# Load the previously fitted label encoder. The context manager guarantees the
# file handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('./label_encoder.p', 'rb') as pickle_in:
    le = pickle.load(pickle_in)

print('before: %s ...' % y_train[:5])

# Replace the string mood labels with the encoder's integer codes
# (both splits must go through the same encoder).
y_train = le.transform(y_train)
y_test = le.transform(y_test)

print('after: %s ...' % y_train[:5])
before: 0      sad
1    happy
2      sad
3    happy
4      sad
Name: mood, dtype: object ...
after: [1 0 1 0 1] ...





Text Preprocessing

In [5]:
import pickle
# Load the stop-word list and the sentiment whitelist. Context managers close
# the file handles deterministically (the bare open() calls never closed them).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('./stopwords.p', 'rb') as stopwords_file:
    stop_words = pickle.load(stopwords_file)

with open('./whitelist_dicts/semantic_words_py34.p', 'rb') as whitelist_file:
    semantic_words = pickle.load(whitelist_file)



Transform texts into bag of words models - Trying different tokenizers

In [6]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

porter = PorterStemmer()
snowball = EnglishStemmer()


def tokenizer(text):
    """Return the raw whitespace-separated words of ``text``."""
    return text.split()


def tokenizer_porter(text):
    """Return the words of ``text`` after Porter stemming."""
    return list(map(porter.stem, text.split()))


def tokenizer_snowball(text):
    """Return the words of ``text`` after Snowball (English) stemming."""
    return list(map(snowball.stem, text.split()))


def tokenizer_whitelist(text):
    """Return only the words of ``text`` found in ``semantic_words``.

    ``semantic_words`` is the 'whitelist' of 'positive' or 'negative' words:
    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
    """
    return [word for word in text.split() if word in semantic_words]


def tokenizer_porter_wl(text):
    """Return Porter stems of the whitelisted words (filter first, then stem)."""
    return list(map(porter.stem, tokenizer_whitelist(text)))


def tokenizer_snowball_wl(text):
    """Return Snowball stems of the whitelisted words (filter first, then stem)."""
    return list(map(snowball.stem, tokenizer_whitelist(text)))



Looking at vocabulary sizes

In [7]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.sklearn import DenseTransformer

def _strip_non_letters(text):
    """Lower-case ``text`` and replace every character outside a-z/A-Z with a space."""
    return re.sub('[^a-zA-Z]', ' ', text.lower())


def _build_vectorizer(vectorizer_cls, tokenize):
    """Instantiate ``vectorizer_cls`` with the settings shared by every experiment.

    Only the vectorizer class and the tokenizer vary between the twelve
    configurations; everything else (stop words, unigrams, preprocessing)
    is held constant.
    """
    return vectorizer_cls(binary=False,
                          stop_words=stop_words,
                          ngram_range=(1,1),
                          preprocessor=_strip_non_letters,
                          tokenizer=tokenize)


# Tokenizer order defines the numbering: raw, porter, snowball, whitelist,
# porter+whitelist, snowball+whitelist.
_tokenizer_variants = (tokenizer,
                       tokenizer_porter,
                       tokenizer_snowball,
                       tokenizer_whitelist,
                       tokenizer_porter_wl,
                       tokenizer_snowball_wl)

# vect_1..vect_6 are CountVectorizers, vect_7..vect_12 are TfidfVectorizers.
vectorizers = [_build_vectorizer(vectorizer_cls, tokenize)
               for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
               for tokenize in _tokenizer_variants]

(vect_1, vect_2, vect_3, vect_4, vect_5, vect_6,
 vect_7, vect_8, vect_9, vect_10, vect_11, vect_12) = vectorizers

# One pipeline per vectorizer: vectorize -> densify -> 100-tree random forest.
pipelines = [Pipeline([('vect', v),
                       ('dense', DenseTransformer()),
                       ('clf', RandomForestClassifier(n_estimators=100))])
             for v in vectorizers]
In [17]:
# done before max_features was set

print('Vocabulary sizes\n')
labels = [
    'CountVec', 'CountVec porter', 'CountVec snowball',
    'CountVec wl', 'CountVec porter+wl', 'CountVec snowball+wl',
    'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball',
    'TfidfVec wl', 'TfidfVec porter+wl', 'TfidfVec snowball+wl',
]

# Fit each vectorizer on the training lyrics and report how many distinct
# terms it learned (fit() returns the fitted vectorizer itself).
for label, v in zip(labels, vectorizers):
    vocabulary_size = len(v.fit(X_train).vocabulary_)
    print('%s: %s' % (label, vocabulary_size))
Vocabulary sizes

CountVec: 11378
CountVec porter: 8551
CountVec snowball: 8528
CountVec wl: 1666
CountVec porter+wl: 1349
CountVec snowball+wl: 1332
TfidfVec: 11378
TfidfVec porter: 8551
TfidfVec snowball: 8528
TfidfVec wl: 1666
TfidfVec porter+wl: 1349
TfidfVec snowball+wl: 1332



Model Selection

Feature Extraction - Cross Validation Error

In [19]:
from sklearn import metrics
from sklearn import cross_validation

labels = [
    'CountVec', 'CountVec porter', 'CountVec snowball',
    'CountVec wl', 'CountVec porter+wl', 'CountVec snowball+wl',
    'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball',
    'TfidfVec wl', 'TfidfVec porter+wl', 'TfidfVec snowball+wl',
]

# Collects one formatted "mean (+/- std)" string per pipeline.
d = {'Data': labels,
     'ROC AUC (%)': []}

# 10-fold cross-validated ROC AUC on the training set for every pipeline.
for i, clf in enumerate(pipelines):
    scores = cross_validation.cross_val_score(estimator=clf,
                                              X=X_train,
                                              y=y_train,
                                              scoring='roc_auc',
                                              cv=10)
    mean_pct = scores.mean()*100
    std_pct = scores.std()*100
    print('clf %s, %s: %s' % (i+1, labels[i], mean_pct))
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (mean_pct, std_pct))
clf 1, CountVec: 72.4267184835
clf 2, CountVec porter: 72.5908238226
clf 3, CountVec snowball: 74.2652990948
clf 4, CountVec wl: 68.4214056146
clf 5, CountVec porter+wl: 68.7880362062
clf 6, CountVec snowball+wl: 69.3732388823
clf 7, TfidfVec: 70.9020529975
clf 8, TfidfVec porter: 73.5260330579
clf 9, TfidfVec snowball: 72.5986685032
clf 10, TfidfVec wl: 66.7347697757
clf 11, TfidfVec porter+wl: 69.1317821068
clf 12, TfidfVec snowball+wl: 68.3577298964
In [20]:
# Keep only the ROC AUC column and label each row with its configuration.
df_perform = pd.DataFrame(d)['ROC AUC (%)']
df_perform.index = labels
df_perform
Out[20]:
CountVec                72.43 (+/- 4.18)
CountVec porter         72.59 (+/- 6.05)
CountVec snowball       74.27 (+/- 5.00)
CountVec wl             68.42 (+/- 4.92)
CountVec porter+wl      68.79 (+/- 4.48)
CountVec snowball+wl    69.37 (+/- 4.69)
TfidfVec                70.90 (+/- 4.03)
TfidfVec porter         73.53 (+/- 4.21)
TfidfVec snowball       72.60 (+/- 4.61)
TfidfVec wl             66.73 (+/- 5.91)
TfidfVec porter+wl      69.13 (+/- 5.08)
TfidfVec snowball+wl    68.36 (+/- 6.03)
Name: ROC AUC (%), dtype: object
In [21]:
# Persist the cross-validated ROC AUC summary. With index=False the
# configuration labels are not written -- only the formatted score strings.
df_perform.to_csv('./random_forests_data/rand_forest_featextr_1.csv', index=False)



ROC Curve

In [22]:
%matplotlib inline
In [24]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

# TF-IDF over Porter-stemmed unigrams (vocabulary capped via max_features),
# densified and fed into a 100-tree random forest.
classifier = Pipeline([('vect',   TfidfVectorizer(binary=False,
                                             stop_words=stop_words,
                                             ngram_range=(1,1),
                                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                                             max_features = 5000,
                                             tokenizer=lambda text: [porter.stem(word) for word in text.split()]
                )),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=100))])


cv = KFold(y_train.shape[0], n_folds=10, random_state=123)

# Common FPR grid on which every fold's ROC curve is interpolated so the
# curves can be averaged point-wise.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute the ROC curve and the area under the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    # np.interp is the function the deprecated scipy.interp alias pointed to.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0  # force the averaged curve through the origin
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0  # force the averaged curve through (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The legend must exist before the figure is laid out and written to disk;
# otherwise the saved file is missing it.
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig('./random_forests_images/roc_tfidf_porter_1.eps', dpi=300)

plt.show()





Hyperparameter Tuning

In [8]:
# TF-IDF vectorizer used throughout hyperparameter tuning: letters-only,
# lower-cased text split on whitespace, Porter-stemmed, unigrams only.
vect = TfidfVectorizer(
    tokenizer=lambda text: [porter.stem(word) for word in text.split()],
    preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
    ngram_range=(1,1),
    stop_words=stop_words,
    binary=False,
)



Effect of the number of estimators

In [10]:
from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation

# Forest sizes under comparison; also used as the printed row labels.
labels = [50, 100, 200, 400]

# One pipeline per forest size. All four reference the same `vect` object.
pipe_1, pipe_2, pipe_3, pipe_4 = [
    Pipeline([
        ('vect',   vect),
        ('dense', DenseTransformer()),
        ('clf', RandomForestClassifier(n_estimators=n_trees))])
    for n_trees in labels
]

# 10-fold cross-validated ROC AUC for each forest size.
for i, clf in enumerate([pipe_1, pipe_2, pipe_3, pipe_4]):
    scores = cross_validation.cross_val_score(estimator=clf,
                                              X=X_train,
                                              y=y_train,
                                              scoring='roc_auc',
                                              cv=10)
    mean_pct, std_pct = scores.mean()*100, scores.std()*100
    print('clf %s, %s: %0.2f (+/- %0.2f)' % (i+1, labels[i], mean_pct, std_pct))
clf 1, 50: 71.31 (+/- 6.53)
clf 2, 100: 71.62 (+/- 4.76)
clf 3, 200: 72.51 (+/- 3.81)
clf 4, 400: 74.77 (+/- 3.80)



GridSearch

In [12]:
# Vectorize the training lyrics once and densify the result so the grid
# search below can work on a plain array.
X_train_feat = vect.fit_transform(X_train, y_train).toarray()
In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report


# Base estimator whose remaining hyperparameters are searched below.
clf_2 = RandomForestClassifier(n_estimators=50)


# Exhaustive grid: 2 criteria x 3 max_features x 2 split sizes x 2 leaf sizes.
tuned_parameters = [
  {'criterion': ['gini', 'entropy'],
   'max_features': ['auto', 'log2', 'sqrt'],
   'min_samples_split':[2,3],
   'min_samples_leaf':[1,2]},
 ]


grid_search_1 = GridSearchCV(clf_2,
                           tuned_parameters,
                           n_jobs=1,
                           scoring='roc_auc',
                           cv=10
                )

# The target must be the encoded mood labels. Passing the feature matrix as
# `y` (as this cell previously did) does not fit a mood classifier at all.
grid_search_1.fit(X_train_feat, y_train)

print("Best parameters set found on development set:")
print()
print(grid_search_1.best_estimator_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in grid_search_1.grid_scores_:
    # Report the plain standard deviation of the fold scores, consistent with
    # the other "+/-" figures printed in this notebook.
    print("%0.3f (+/-%0.03f) for %r"
            % (mean_score, scores.std(), params))



Validation

In [16]:
# Custom scorer methods to account for positive-negative class labels

from sklearn import metrics

# `pos_label` for positive class, since we have sad=1, happy=0
# Precision, recall and F1 below therefore treat label 0 as the positive class.
_positive_is_label_0 = {'greater_is_better': True, 'pos_label': 0}

acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, **_positive_is_label_0)
rec_scorer = metrics.make_scorer(metrics.recall_score, **_positive_is_label_0)
f1_scorer = metrics.make_scorer(metrics.f1_score, **_positive_is_label_0)
# NOTE(review): this scorer is built without `needs_threshold`/`needs_proba`,
# so it presumably scores hard predict() labels rather than probabilities,
# unlike the 'roc_auc' scoring used for cross-validation -- confirm intended.
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)
In [21]:
labels = [
    'Train CountVec', 'Train CountVec porter', 'Train CountVec snowball',
    'Train CountVec wl', 'Train CountVec porter+wl', 'Train CountVec snowball+wl',
    'Train TfidfVec', 'Train TfidfVec porter', 'Train TfidfVec snowball',
    'Train TfidfVec wl', 'Train TfidfVec porter+wl', 'Train TfidfVec snowball+wl',
    'Test CountVec', 'Test CountVec porter', 'Test CountVec snowball',
    'Test CountVec wl', 'Test CountVec porter+wl', 'Test CountVec snowball+wl',
    'Test TfidfVec', 'Test TfidfVec porter', 'Test TfidfVec snowball',
    'Test TfidfVec wl', 'Test TfidfVec porter+wl', 'Test TfidfVec snowball+wl',
]

d = {'Data': labels,
     'ACC (%)': [],
     'PRE (%)': [],
     'REC (%)': [],
     'F1 (%)': [],
     'ROC AUC (%)': [],
}

# Fit every pipeline on the full training set before scoring anything.
for clf in pipelines:
    clf.fit(X_train, y_train)

# Metric column -> scorer, in the order the columns are filled.
_column_scorers = (('ACC (%)', acc_scorer),
                   ('PRE (%)', pre_scorer),
                   ('REC (%)', rec_scorer),
                   ('F1 (%)', f1_scorer),
                   ('ROC AUC (%)', auc_scorer))

# Rows are appended in the same order as `labels`: all pipelines on the
# training split first, then all pipelines on the test split.
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    for clf in pipelines:
        for column, scorer in _column_scorers:
            d[column].append(scorer(estimator=clf, X=X_split, y_true=y_split))
In [22]:
pd.set_option('precision', 2)

# Select the metric columns, label the rows, and express scores as
# percentages rounded to two decimals.
metric_columns = ['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']
df_perform = pd.DataFrame(d)[metric_columns]
df_perform.index = labels
df_perform = np.round(df_perform * 100, decimals=2)
df_perform
Out[22]:
ACC (%) PRE (%) REC (%) F1 (%) ROC AUC (%)
Train CountVec 99.8 99.8 99.8 99.8 99.8
Train CountVec porter 99.8 99.8 99.8 99.8 99.8
Train CountVec snowball 99.8 99.8 99.8 99.8 99.8
Train CountVec wl 99.5 100.0 98.9 99.4 99.4
Train CountVec porter+wl 99.5 99.8 99.1 99.4 99.5
Train CountVec snowball+wl 99.5 99.8 99.1 99.4 99.5
Train TfidfVec 99.8 99.5 100.0 99.8 99.8
Train TfidfVec porter 99.8 100.0 99.5 99.8 99.8
Train TfidfVec snowball 99.8 99.8 99.8 99.8 99.8
Train TfidfVec wl 99.4 99.5 99.1 99.3 99.4
Train TfidfVec porter+wl 99.4 99.5 99.1 99.3 99.4
Train TfidfVec snowball+wl 99.4 100.0 98.7 99.3 99.3
Test CountVec 61.5 70.6 45.7 55.5 62.3
Test CountVec porter 61.5 72.6 42.9 53.9 62.5
Test CountVec snowball 68.0 80.6 51.4 62.8 68.9
Test CountVec wl 61.0 70.2 44.8 54.6 61.9
Test CountVec porter+wl 62.5 70.3 49.5 58.1 63.2
Test CountVec snowball+wl 62.0 69.3 49.5 57.8 62.7
Test TfidfVec 60.5 71.0 41.9 52.7 61.5
Test TfidfVec porter 63.5 77.6 42.9 55.2 64.6
Test TfidfVec snowball 61.5 74.1 41.0 52.8 62.6
Test TfidfVec wl 56.5 62.5 42.9 50.9 57.2
Test TfidfVec porter+wl 61.0 68.5 47.6 56.2 61.7
Test TfidfVec snowball+wl 63.5 69.0 55.2 61.4 63.9
In [23]:
# Persist the train/test metric table; float_format='%2.2f' writes every
# score with two decimal places.
df_perform.to_csv('./random_forests_data/clf_performance.csv', index_label=False, float_format='%2.2f')