Classify a movie's genres based on its plot.
https://www.kaggle.com/c/miia4200-20191-p2-moviegenreclassification/overview
Input: The text of the movie's plot (the dataset also provides the title and year)
Output: Probability that the movie belongs to each genre
We thank Professor Fabio Gonzalez, Ph.D. and his student John Arevalo for providing this dataset.
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
# Load the training set (which has a `genres` column) and the test set (which
# does not) from the course repository. Both files share the same read
# options: UTF-8 text, with the first CSV column used as the DataFrame index.
csv_options = dict(encoding='UTF-8', index_col=0)
training_url = 'https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTraining.zip'
testing_url = 'https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTesting.zip'

dataTraining = pd.read_csv(training_url, **csv_options)
dataTesting = pd.read_csv(testing_url, **csv_options)

# Preview the first rows of the training data.
dataTraining.head()
year | title | plot | genres | rating | |
---|---|---|---|---|---|
3107 | 2003 | Most | most is the story of a single father who takes... | [Short, Drama] | 8.0 |
900 | 2008 | How to Be a Serial Killer | a serial killer decides to teach the secrets o... | [Comedy, Crime, Horror] | 5.6 |
6724 | 1941 | A Woman's Face | in sweden , a female blackmailer with a disfi... | [Drama, Film-Noir, Thriller] | 7.2 |
4704 | 1954 | Executive Suite | in a friday afternoon in new york , the presi... | [Drama] | 7.4 |
2582 | 1990 | Narrow Margin | in los angeles , the editor of a publishing h... | [Action, Crime, Thriller] | 6.6 |
# Preview the first five rows of the test set (year, title, plot; no labels).
dataTesting.head(n=5)
year | title | plot | |
---|---|---|---|
1 | 1999 | Message in a Bottle | who meets by fate , shall be sealed by fate .... |
4 | 1978 | Midnight Express | the true story of billy hayes , an american c... |
5 | 1996 | Primal Fear | martin vail left the chicago da ' s office to ... |
6 | 1950 | Crisis | husband and wife americans dr . eugene and mr... |
7 | 1959 | The Tingler | the coroner and scientist dr . warren chapin ... |
# Turn the training plots into a bag-of-words document-term matrix whose
# vocabulary is capped at 1000 terms.
training_plots = dataTraining['plot']
vect = CountVectorizer(max_features=1000)
X_dtm = vect.fit_transform(training_plots)

# (n_documents, n_terms)
X_dtm.shape
(7895, 1000)
# Show the first 50 vocabulary terms.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it. The result is converted to a plain
# list so the printed output keeps the same format either way.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print(feature_names[:50])
['able', 'about', 'accepts', 'accident', 'accidentally', 'across', 'act', 'action', 'actor', 'actress', 'actually', 'adam', 'adult', 'adventure', 'affair', 'after', 'again', 'against', 'age', 'agent', 'agents', 'ago', 'agrees', 'air', 'alan', 'alex', 'alice', 'alien', 'alive', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'america', 'american', 'among', 'an', 'and', 'angeles', 'ann', 'anna', 'another', 'any', 'anyone', 'anything', 'apartment']
import ast


def _parse_genres(raw):
    """Convert a stringified list of genres into a real Python list.

    The CSV stores each movie's genres as the text of a Python list literal.
    ``ast.literal_eval`` parses only literals, so unlike ``eval`` it cannot
    execute arbitrary code embedded in the downloaded file. Values that are
    not strings (e.g. already-parsed lists when this cell is re-run) are
    returned unchanged, which makes the conversion idempotent.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


dataTraining['genres'] = dataTraining['genres'].map(_parse_genres)

# Encode the per-movie genre lists as a binary indicator matrix with one
# column per genre; the column order is given by `le.classes_`.
le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])
y_genres
array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 1, 0, 0], ..., [0, 1, 0, ..., 0, 0, 0], [0, 1, 1, ..., 0, 0, 0], [0, 1, 1, ..., 0, 0, 0]])
# Hold out 33% of the labelled rows for validation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train_genres, y_test_genres = train_test_split(
    X_dtm,
    y_genres,
    random_state=42,
    test_size=0.33,
)

# One-vs-rest: an independent random forest is fitted for every genre column.
genre_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
clf = OneVsRestClassifier(genre_forest)
clf.fit(X_train, y_train_genres)
OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=10, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False), n_jobs=None)
# Score the held-out rows: per-genre probabilities, then the ROC AUC
# macro-averaged over the genre columns.
y_pred_genres = clf.predict_proba(X_test)
roc_auc_score(
    y_true=y_test_genres,
    y_score=y_pred_genres,
    average='macro',
)
0.7812262183677007
# Vectorize the test plots with the vocabulary learned on the training set
# (transform only, no refit) so the feature columns line up with the model.
X_test_dtm = vect.transform(dataTesting['plot'])

# Name the output columns after the genres the binarizer actually learned.
# The probability columns returned by `clf.predict_proba` follow the column
# order of the label matrix it was fitted on, i.e. `le.classes_`. Deriving
# the names from it keeps labels and probabilities in sync instead of relying
# on a hand-maintained list matching that order.
cols = ['p_' + genre for genre in le.classes_]

y_pred_test_genres = clf.predict_proba(X_test_dtm)
res = pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols)
res.head()
p_Action | p_Adventure | p_Animation | p_Biography | p_Comedy | p_Crime | p_Documentary | p_Drama | p_Family | p_Fantasy | ... | p_Musical | p_Mystery | p_News | p_Romance | p_Sci-Fi | p_Short | p_Sport | p_Thriller | p_War | p_Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.143030 | 0.101960 | 0.024454 | 0.029938 | 0.354552 | 0.138830 | 0.030787 | 0.490140 | 0.073159 | 0.101339 | ... | 0.025069 | 0.063208 | 0.000000 | 0.362818 | 0.056648 | 0.008970 | 0.017522 | 0.202605 | 0.033989 | 0.018117 |
4 | 0.122624 | 0.085786 | 0.024213 | 0.084795 | 0.370949 | 0.216657 | 0.080359 | 0.515684 | 0.062976 | 0.067019 | ... | 0.024734 | 0.060935 | 0.000477 | 0.149703 | 0.058190 | 0.014248 | 0.020099 | 0.204794 | 0.030438 | 0.018506 |
5 | 0.151364 | 0.110284 | 0.013762 | 0.075334 | 0.304837 | 0.448736 | 0.021010 | 0.611544 | 0.081741 | 0.169121 | ... | 0.044538 | 0.261372 | 0.000000 | 0.335987 | 0.128505 | 0.001016 | 0.048658 | 0.423242 | 0.052693 | 0.025351 |
6 | 0.154448 | 0.125772 | 0.020991 | 0.064124 | 0.340779 | 0.140892 | 0.009133 | 0.632038 | 0.068287 | 0.063631 | ... | 0.131074 | 0.088418 | 0.000000 | 0.197224 | 0.132208 | 0.001432 | 0.039743 | 0.269385 | 0.077607 | 0.017862 |
7 | 0.175143 | 0.210069 | 0.035476 | 0.032505 | 0.313850 | 0.243150 | 0.021793 | 0.427885 | 0.079781 | 0.143879 | ... | 0.023859 | 0.090359 | 0.000048 | 0.205117 | 0.241663 | 0.002634 | 0.018403 | 0.259465 | 0.021569 | 0.017585 |
5 rows × 24 columns
# Save the predicted genre probabilities as a CSV file, labelling the index
# column 'ID'.
submission_file = 'pred_genres_text_RF.csv'
res.to_csv(submission_file, index_label='ID')