In [1]:

%load_ext watermark
%watermark -a 'Sebastian Raschka' -v

Sebastian Raschka 

CPython 3.4.2
IPython 2.3.0

Music Mood Classification Using Random Forests¶

Sections¶

Reading the Training Dataset
Text Preprocessing
- Feature Extraction - Cross Validation Error
- Looking at vocabulary sizes
Model Selection
- Feature Extraction - Cross Validation Error
- ROC Curve
Hyperparameter Tuning
- Effect of the number of estimators
- GridSearch

Reading the Training Dataset¶

[back to top]

In [13]:

import pandas as pd

df_train = pd.read_csv('../../dataset/training/train_lyrics_1000.csv')
df_test = pd.read_csv('../../dataset/validation/valid_lyrics_200.csv')

df_train.tail()

Out[13]:

	file	artist	title	lyrics	mood	year
995	TRBIGRY128F42597B3.h5	Sade	All About Our Love	Its all about our love\nSo shall it be forever...	sad	2000
996	TRBIIEU128F9307C88.h5	New Found Glory	Don't Let Her Pull You Down	It's time that I rain on your parade\nWatch as...	happy	2009
997	TRBIIJY12903CE4755.h5	Mindy McCready	Ten Thousand Angels	Speakin of the devil\nLook who just walked in\...	happy	1996
998	TRBIIOT128F423C594.h5	Joy Division	Leaders Of Men	Born from some mother's womb\nJust like any ot...	sad	1978
999	TRBIJYB128F14AE326.h5	Seventh Day Slumber	Shattered Life	This wanting more from me is tearing me, it's ...	sad	2005

In [14]:

X_train = df_train['lyrics']
y_train = df_train['mood']

X_test = df_test['lyrics']
y_test = df_test['mood']

In [15]:

# Label encoder

import pickle
import numpy as np

pickle_in = open('./label_encoder.p', 'rb')
le = pickle.load(pickle_in)
pickle_in.close()

print('before: %s ...' %y_train[:5])

y_train = le.transform(y_train)
y_test = le.transform(y_test)

print('after: %s ...' %y_train[:5])

before: 0      sad
1    happy
2      sad
3    happy
4      sad
Name: mood, dtype: object ...
after: [1 0 1 0 1] ...

Text Preprocessing¶

[back to top]

In [5]:

import pickle
stop_words = pickle.load(open('./stopwords.p', 'rb'))
semantic_words = pickle.load(open('./whitelist_dicts/semantic_words_py34.p', 'rb'))

Transform texts into bag of words models - Trying different tokenizers¶

[back to top]

In [6]:

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer

porter = PorterStemmer()
snowball = EnglishStemmer()

# raw words
tokenizer = lambda text: text.split()

# words after Porter stemming 
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]

# Words after Snowball stemming
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]

# Only words that are in a list of 'positive' or 'negative' words ('whitelist')
# http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
tokenizer_whitelist = lambda text: [word for word in text.split() if word in semantic_words]

# Porter-stemmed words in whitelist
tokenizer_porter_wl = lambda text: [porter.stem(word) for word in text.split() if word in semantic_words]

# Snowball-stemmed words in whitelist
tokenizer_snowball_wl = lambda text: [snowball.stem(word) for word in text.split() if word in semantic_words]

Looking at vocabulary sizes¶

[back to top]

In [7]:

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.sklearn import DenseTransformer

vect_1 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer)

vect_2 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_porter)
    
vect_3 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_snowball)  

vect_4 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_whitelist)  

vect_5 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_porter_wl)

vect_6 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_snowball_wl)

vect_7 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer)

vect_8 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_porter)
    
vect_9 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_snowball)

vect_10 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_whitelist)    

vect_11 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_porter_wl)

vect_12 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_snowball_wl)


pipelines = []
vectorizers = [vect_1, vect_2, vect_3, vect_4, vect_5, vect_6, vect_7, vect_8, vect_9, vect_10, vect_11, vect_12]
for v in vectorizers:
    pipelines.append(Pipeline([('vect', v),
                               ('dense', DenseTransformer()),
                               ('clf', RandomForestClassifier(n_estimators=100))]))

In [17]:

# done before max_features was set

print('Vocabulary sizes\n')
labels = ['CountVec', 'CountVec porter', 'CountVec snowball', 'CountVec wl', 'CountVec porter+wl','CountVec snowball+wl',
          'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball', 'TfidfVec wl', 'TfidfVec porter+wl','TfidfVec snowball+wl',]

for label, v in zip(labels, vectorizers):
    v.fit(X_train)
    print('%s: %s' % (label, len(v.vocabulary_)))

Vocabulary sizes

CountVec: 11378
CountVec porter: 8551
CountVec snowball: 8528
CountVec wl: 1666
CountVec porter+wl: 1349
CountVec snowball+wl: 1332
TfidfVec: 11378
TfidfVec porter: 8551
TfidfVec snowball: 8528
TfidfVec wl: 1666
TfidfVec porter+wl: 1349
TfidfVec snowball+wl: 1332

Model Selection¶

[back to top]

Feature Extraction - Cross Validation Error¶

[back to top]

In [19]:

from sklearn import metrics
from sklearn import cross_validation

labels = ['CountVec', 'CountVec porter', 'CountVec snowball', 'CountVec wl', 'CountVec porter+wl','CountVec snowball+wl',
          'TfidfVec', 'TfidfVec porter', 'TfidfVec snowball', 'TfidfVec wl', 'TfidfVec porter+wl','TfidfVec snowball+wl',]



d = {'Data':labels,
     'ROC AUC (%)':[],}

for i,clf in enumerate(pipelines):
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=10)
    print('clf %s, %s: %s' % (i+1, labels[i], scores.mean()*100))
    d['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))

clf 1, CountVec: 72.4267184835
clf 2, CountVec porter: 72.5908238226
clf 3, CountVec snowball: 74.2652990948
clf 4, CountVec wl: 68.4214056146
clf 5, CountVec porter+wl: 68.7880362062
clf 6, CountVec snowball+wl: 69.3732388823
clf 7, TfidfVec: 70.9020529975
clf 8, TfidfVec porter: 73.5260330579
clf 9, TfidfVec snowball: 72.5986685032
clf 10, TfidfVec wl: 66.7347697757
clf 11, TfidfVec porter+wl: 69.1317821068
clf 12, TfidfVec snowball+wl: 68.3577298964

In [20]:

df_perform = pd.DataFrame(d)
df_perform = df_perform['ROC AUC (%)']
df_perform.index=(labels)
df_perform

Out[20]:

CountVec                72.43 (+/- 4.18)
CountVec porter         72.59 (+/- 6.05)
CountVec snowball       74.27 (+/- 5.00)
CountVec wl             68.42 (+/- 4.92)
CountVec porter+wl      68.79 (+/- 4.48)
CountVec snowball+wl    69.37 (+/- 4.69)
TfidfVec                70.90 (+/- 4.03)
TfidfVec porter         73.53 (+/- 4.21)
TfidfVec snowball       72.60 (+/- 4.61)
TfidfVec wl             66.73 (+/- 5.91)
TfidfVec porter+wl      69.13 (+/- 5.08)
TfidfVec snowball+wl    68.36 (+/- 6.03)
Name: ROC AUC (%), dtype: object

In [21]:

df_perform.to_csv('./random_forests_data/rand_forest_featextr_1.csv', index=False)

ROC Curve¶

[back to top]

In [22]:

%matplotlib inline

In [24]:

from sklearn.metrics import roc_curve, auc
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

classifier = Pipeline([('vect',   TfidfVectorizer(binary=False,
                                             stop_words=stop_words,
                                             ngram_range=(1,1),
                                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                                             max_features = 5000,
                                             tokenizer=lambda text: [porter.stem(word) for word in text.split()]
                )),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=100))])


cv = KFold(y_train.shape[0], n_folds=10, random_state=123)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.tight_layout()
plt.savefig('./random_forests_images/roc_tfidf_porter_1.eps', dpi=300)
plt.legend(loc="lower right")

plt.show()

Hyperparameter Tuning¶

[back to top]

In [8]:

vect = TfidfVectorizer(binary=False,
                       stop_words=stop_words,
                       ngram_range=(1,1),
                       preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                       tokenizer=lambda text: [porter.stem(word) for word in text.split()])

Effect of the number of estimators¶

[back to top]

In [10]:

from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation

pipe_1 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=50))])

pipe_2 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=100))])

pipe_3 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=200))])

pipe_4 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=400))])

labels = [50, 100, 200, 400]

for i,clf in enumerate([pipe_1, pipe_2, pipe_3, pipe_4]):
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=10)
    print('clf %s, %s: %0.2f (+/- %0.2f)' % (i+1, labels[i], scores.mean()*100, scores.std()*100))

clf 1, 50: 71.31 (+/- 6.53)
clf 2, 100: 71.62 (+/- 4.76)
clf 3, 200: 72.51 (+/- 3.81)
clf 4, 400: 74.77 (+/- 3.80)

GridSearch¶

[back to top]

In [12]:

X_train_feat = vect.fit_transform(X_train, y_train)
X_train_feat = X_train_feat.toarray()

In [ ]:

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report


clf_2 = RandomForestClassifier(n_estimators=50)


tuned_parameters = [
  {'criterion': ['gini', 'entropy'], 
   'max_features': ['auto', 'log2', 'sqrt'],
   'min_samples_split':[2,3], 
   'min_samples_leaf':[1,2]},
 ]


grid_search_1 = GridSearchCV(clf_2, 
                           tuned_parameters, 
                           n_jobs=1, 
                           scoring='roc_auc',
                           cv=10
                )

grid_search_1.fit(X_train_feat, X_train_feat)

print("Best parameters set found on development set:")
print()
print(grid_search_1.best_estimator_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in grid_search_1.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
            % (mean_score, scores.std() / 2, params))

Validation¶

In [16]:

# Custom scorer methods to account for positive-negative class labels

from sklearn import metrics

# `pos_label` for positive class, since we have sad=1, happy=0

acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, greater_is_better=True, pos_label=0)
rec_scorer = metrics.make_scorer(metrics.recall_score, greater_is_better=True, pos_label=0)
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)

In [21]:

labels = ['Train CountVec', 'Train CountVec porter', 'Train CountVec snowball', 'Train CountVec wl', 
          'Train CountVec porter+wl','Train CountVec snowball+wl',
          'Train TfidfVec', 'Train TfidfVec porter', 'Train TfidfVec snowball', 'Train TfidfVec wl', 
          'Train TfidfVec porter+wl','Train TfidfVec snowball+wl',
          'Test CountVec', 'Test CountVec porter', 'Test CountVec snowball', 'Test CountVec wl', 
          'Test CountVec porter+wl','Test CountVec snowball+wl',
          'Test TfidfVec', 'Test TfidfVec porter', 'Test TfidfVec snowball', 'Test TfidfVec wl', 
          'Test TfidfVec porter+wl','Test TfidfVec snowball+wl',]

d = {'Data':labels,
     'ACC (%)':[],
     'PRE (%)':[],
     'REC (%)':[],
     'F1 (%)':[],
     'ROC AUC (%)':[],
}


for clf in pipelines:
    clf.fit(X_train, y_train)

for clf in pipelines:

    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_train, y_true=y_train))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_train, y_true=y_train))

for clf in pipelines:

    d['ACC (%)'].append(acc_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['PRE (%)'].append(pre_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['REC (%)'].append(rec_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['F1 (%)'].append(f1_scorer(estimator=clf, X=X_test, y_true=y_test))
    d['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_test, y_true=y_test))

In [22]:

pd.set_option('precision', 2)

df_perform = pd.DataFrame(d)
df_perform = df_perform[['ACC (%)', 'PRE (%)', 'REC (%)', 'F1 (%)', 'ROC AUC (%)']]
df_perform.index=(labels)
df_perform = df_perform*100
df_perform = np.round(df_perform, decimals=2)
df_perform

Out[22]:

	ACC (%)	PRE (%)	REC (%)	F1 (%)	ROC AUC (%)
Train CountVec	99.8	99.8	99.8	99.8	99.8
Train CountVec porter	99.8	99.8	99.8	99.8	99.8
Train CountVec snowball	99.8	99.8	99.8	99.8	99.8
Train CountVec wl	99.5	100.0	98.9	99.4	99.4
Train CountVec porter+wl	99.5	99.8	99.1	99.4	99.5
Train CountVec snowball+wl	99.5	99.8	99.1	99.4	99.5
Train TfidfVec	99.8	99.5	100.0	99.8	99.8
Train TfidfVec porter	99.8	100.0	99.5	99.8	99.8
Train TfidfVec snowball	99.8	99.8	99.8	99.8	99.8
Train TfidfVec wl	99.4	99.5	99.1	99.3	99.4
Train TfidfVec porter+wl	99.4	99.5	99.1	99.3	99.4
Train TfidfVec snowball+wl	99.4	100.0	98.7	99.3	99.3
Test CountVec	61.5	70.6	45.7	55.5	62.3
Test CountVec porter	61.5	72.6	42.9	53.9	62.5
Test CountVec snowball	68.0	80.6	51.4	62.8	68.9
Test CountVec wl	61.0	70.2	44.8	54.6	61.9
Test CountVec porter+wl	62.5	70.3	49.5	58.1	63.2
Test CountVec snowball+wl	62.0	69.3	49.5	57.8	62.7
Test TfidfVec	60.5	71.0	41.9	52.7	61.5
Test TfidfVec porter	63.5	77.6	42.9	55.2	64.6
Test TfidfVec snowball	61.5	74.1	41.0	52.8	62.6
Test TfidfVec wl	56.5	62.5	42.9	50.9	57.2
Test TfidfVec porter+wl	61.0	68.5	47.6	56.2	61.7
Test TfidfVec snowball+wl	63.5	69.0	55.2	61.4	63.9

In [23]:

df_perform.to_csv('./random_forests_data/clf_performance.csv', index_label=False, float_format='%2.2f')

In [ ]: