In [1]:

import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

Read Data¶

In [2]:

path = ''

In [3]:

dataTraining = pd.read_csv(os.path.join(path, 'data', 'dataTraining.csv'), encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv(os.path.join(path, 'data', 'dataTesting.csv'), encoding='UTF-8', index_col=0)

In [4]:

dataTesting.head()

Out[4]:

	year	title	plot
1	1999	Message in a Bottle	who meets by fate , shall be sealed by fate ....
4	1978	Midnight Express	the true story of billy hayes , an american c...
5	1996	Primal Fear	martin vail left the chicago da ' s office to ...
6	1950	Crisis	husband and wife americans dr . eugene and mr...
7	1959	The Tingler	the coroner and scientist dr . warren chapin ...

In [5]:

dataTesting.head()

Out[5]:

	year	title	plot
1	1999	Message in a Bottle	who meets by fate , shall be sealed by fate ....
4	1978	Midnight Express	the true story of billy hayes , an american c...
5	1996	Primal Fear	martin vail left the chicago da ' s office to ...
6	1950	Crisis	husband and wife americans dr . eugene and mr...
7	1959	The Tingler	the coroner and scientist dr . warren chapin ...

Create count vectorizer with ngrams¶

In [6]:

vect = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X_dtm = vect.fit_transform(dataTraining['plot'])
X_dtm.shape

Out[6]:

(7895, 1000)

In [7]:

print(vect.get_feature_names()[:50])

['able', 'able to', 'about', 'about the', 'about to', 'accident', 'across', 'act', 'action', 'actually', 'affair', 'after', 'after the', 'again', 'against', 'against the', 'age', 'agent', 'ago', 'alex', 'alive', 'all', 'all of', 'all the', 'alone', 'along', 'along the', 'along with', 'already', 'also', 'although', 'always', 'america', 'american', 'among', 'an', 'an old', 'and', 'and has', 'and he', 'and her', 'and his', 'and is', 'and she', 'and that', 'and the', 'and their', 'and then', 'and they', 'and when']

Create y¶

In [8]:

dataTraining['genres'] = dataTraining['genres'].map(lambda x: eval(x))

le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])

Split train and test¶

In [10]:

X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_dtm, y_genres, test_size=0.33, random_state=42)

Train multi-class multi-label model¶

In [11]:

clf = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=10, random_state=42))

In [12]:

clf.fit(X_train, y_train_genres)

Out[12]:

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          n_jobs=1)

In [13]:

y_pred_genres = clf.predict_proba(X_test)

In [14]:

roc_auc_score(y_test_genres, y_pred_genres, average='macro')

Out[14]:

0.7437754044790014

Apply models to kaggle test¶

In [15]:

X_test_dtm = vect.transform(dataTesting['plot'])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

y_pred_test_genres = clf.predict_proba(X_test_dtm)

pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols).to_csv('pred_genres_text_RF.csv', index_label='ID')

In [ ]: