# Graph measures

Compute graph measures corresponding to social networks in the movies

In [1]:
import networkx as nx
from matplotlib import pyplot as plt
import numpy.linalg
import pymongo as pm
import pandas as pd
from community import community_louvain as community
import numpy as np
import seaborn as sns

In [2]:
# Dark theme with larger "talk"-sized fonts.
# NOTE: matplotlib >= 3.6 renamed the seaborn styles to 'seaborn-v0_8-*';
# fall back to the new name if the old one is gone.
try:
    plt.style.use(['dark_background', 'seaborn-talk'])
except OSError:
    plt.style.use(['dark_background', 'seaborn-v0_8-talk'])

In [3]:
# Directory holding one GEXF social-network file per movie, named '<mg_id>.gexf'.
PATH = './gexf/'


In [4]:
fname = "movies-unique.txt"

# One movie per line, formatted "<numeric id>-<Dash-Separated-Title>".
with open(fname) as f:
    # FIX: read from the file handle; the original iterated over the
    # not-yet-defined name 'content' and raised a NameError.
    content = [x.strip() for x in f]

import re
movies = {}  # mg_id -> human-readable title
for movie in content:
    id_match = re.search(r'(^\d+)', movie)    # leading numeric id (raw strings avoid invalid-escape warnings)
    title_match = re.search(r'(-.*)', movie)  # everything after the first dash
    movies[int(id_match.group(0))] = title_match.group(0)[1:].replace('-', ' ')

# fix some non-ASCII characters, numbers, and alternative movie titles
movies[38] = 'Alien 3'
movies[873] = 'WALL E'
movies[100] = 'Bad Lieutenant: Port of Call New Orleans'
movies[3] = '12 and Holding'
movies[453] = 'Jennifer 8'
movies[575] = 'Mrs Brown'
movies[647] = 'Postino'
movies[735] = 'Shivers'
del movies[885] # the movie "White Jazz" has never been filmed


MongoDB connector

In [5]:
# Connect to the local MongoDB instance (default host/port) and grab the
# 'movies' collection of the 'moviegalaxies' database.
client = pm.MongoClient()
db = client.moviegalaxies
movies_collection = db.movies


Retrieve genres for each movie

In [6]:
# genres of the movies MG_ID-GENRE
# Build mg_id -> list-of-genre-strings; movies without genre metadata are
# collected in no_genre instead.
genres = {}
no_genre = []
for doc in movies_collection.find():
    genre_field = doc['Genre']
    if genre_field == 'N/A':
        no_genre.append(doc['mg_id'])
    else:
        genres[doc['mg_id']] = genre_field.split(', ')
print(no_genre, "do(es) not have genre")

[906] do(es) not have genre


Assign labels to each genre. Keep only popular genres

In [7]:
# One row per movie: mg_id, primary genre string, and numeric class label.
genres_df = pd.DataFrame(columns=['mg_id', 'genres', 'target'])

In [8]:
# Popular genres kept as classification targets.
main_genres = ['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']

In [9]:
main_genres_dict = {main_genres[i]: i for i in range(len(main_genres))}

In [10]:
# Inspect the genre -> label mapping.
main_genres_dict

Out[10]:
{'Action': 0,
'Biography': 4,
'Comedy': 1,
'Crime': 3,
'Drama': 2,
'Horror': 6}
In [11]:
class_genre_dict = dict (zip(main_genres_dict.values(), main_genres_dict.keys()))

In [12]:
# Keep only movies whose first-listed genre is one of the popular genres,
# appending one (mg_id, primary genre, label) row per kept movie.
row = 0
for movie_id, movie_genres in genres.items():
    primary = movie_genres[0]
    if primary not in main_genres:
        continue
    genres_df.loc[row] = [movie_id, primary, main_genres_dict[primary]]
    row += 1

In [13]:
# Preview the first ten labelled movies.
genres_df[:10]

Out[13]:
mg_id genres target
0 101 Comedy 1
2 102 Crime 3
3 103 Comedy 1
5 106 Action 0
6 107 Crime 3
7 108 Biography 4
8 109 Action 0
9 110 Action 0
In [14]:
# Number of movies that received a label.
genres_df.shape

Out[14]:
(731, 3)
In [15]:
# Class balance: number of movies per primary genre.
genres_df.groupby('genres').count()

Out[15]:
mg_id target
genres
Action 200 200
Biography 45 45
Comedy 171 171
Crime 82 82
Drama 151 151
Horror 41 41
In [16]:
# Like genres_df, but keeps the movie's full genre list instead of only the first.
full_genres_df = pd.DataFrame(columns=['mg_id', 'genres_list', 'target'])

In [17]:
# Same labelling pass, but store the full genre list per movie.
row = 0
for movie_id, movie_genres in genres.items():
    primary = movie_genres[0]
    if primary in main_genres:
        full_genres_df.loc[row] = [movie_id, movie_genres, main_genres_dict[primary]]
        row += 1

In [18]:
# Preview the table with full genre lists.
full_genres_df.head()

Out[18]:
mg_id genres_list target
0 101 [Comedy, Crime, Drama] 1
2 102 [Crime, Drama] 3
3 103 [Comedy, Drama, Music] 1
4 104 [Adventure, Drama, History] 5

Compute network measures

In [19]:
def compute_eigenvalues(g):
    """
    Compute the eigenvalues of the normalized Laplacian of graph ``g``.

    Parameters
    ----------
    g : networkx graph (directed graphs are converted to undirected first)

    Returns
    -------
    numpy.ndarray
        Eigenvalues as returned by ``numpy.linalg.eigvals`` (may be
        complex-typed even though the matrix is symmetric).
    """
    g = g.to_undirected()
    L = nx.normalized_laplacian_matrix(g)
    # FIX: use .toarray() -- the sparse-matrix .A shorthand is deprecated and
    # removed for sparse arrays in modern SciPy.
    e = numpy.linalg.eigvals(L.toarray())
    return e

In [20]:
# One row of topological measures per movie graph (filled in the next cell).
general_measures_df = pd.DataFrame(columns=[
'mg_id',
'clustering',
'assortativity',
'transitivity',
'modularity',
'deg_mean',
'deg_std',
'nodes',
'edges',
'shortest'])

In [21]:
# Fill general_measures_df: load each movie's GEXF graph and compute a row
# of network statistics.
loc = 0
for movie_id, title in movies.items():
    if movie_id == 2:  # graph '2' is incomplete -- skip it
        continue
    g = nx.read_gexf(PATH + str(movie_id) + '.gexf')

    clustering = nx.average_clustering(g)
    # Degree assortativity: the tendency for vertices to connect to vertices
    # with similar (or dissimilar) degree. Pearson-correlation variant of
    # nx.degree_assortativity_coefficient.
    # https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.assortativity.html
    # https://arxiv.org/pdf/cond-mat/0209450.pdf
    assortativity = nx.degree_pearson_correlation_coefficient(g)
    transitivity = nx.transitivity(g)
    ev = compute_eigenvalues(g)  # NOTE(review): computed but not stored in the table
    degrees = [d for n, d in g.degree()]
    part = community.best_partition(g)
    modularity = community.modularity(part, g)
    shortest_path = nx.average_shortest_path_length(g)

    general_measures_df.loc[loc] = [
        movie_id,
        clustering,
        assortativity,
        transitivity,
        modularity,
        np.mean(degrees),
        np.std(degrees),
        len(g.nodes),
        len(g.edges),
        shortest_path,
    ]
    loc += 1

/home/volodymyrmiz/miniconda3/lib/python3.6/site-packages/scipy/stats/stats.py:3003: RuntimeWarning: invalid value encountered in double_scalars
r = r_num / r_den

In [22]:
# Inspect the first few computed measures.
general_measures_df.head()

Out[22]:
mg_id clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 100.0 0.805678 -0.342569 0.280090 0.446111 4.307692 5.598323 39.0 84.0 1.929825
1 101.0 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
2 10.0 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
3 102.0 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
4 103.0 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
In [23]:
# Number of graphs processed.
general_measures_df.shape

Out[23]:
(772, 10)

Add 'genres' and 'target' columns. Only popular genres are kept

In [24]:
# Attach genre labels; merging on mg_id also restricts the table to the
# popular genres, and dropna() removes rows with undefined measures.
general_measures_df = genres_df.merge(general_measures_df, on='mg_id').dropna()

In [25]:
# Rows remaining after the merge and NaN removal.
general_measures_df.shape

Out[25]:
(728, 12)
In [26]:
# Measures table with genre labels attached.
general_measures_df.head()

Out[26]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
1 10 Adventure 5 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
2 102 Crime 3 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
3 103 Comedy 1 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
4 104 Adventure 5 0.637863 -0.382850 0.197007 0.335873 4.042553 6.236330 47.0 95.0 1.980574
In [27]:
# general_measures_df.hist()
# plt.show()


Normalize

In [28]:
# mean normalization
def mean_norm(df, columns):
    """Return a copy of ``df`` in which every column in ``columns`` is
    standardised to zero mean and unit (sample, ddof=1) standard deviation.
    The input frame is left untouched."""
    normalised = df.copy()
    for col in columns:
        normalised[col] = (df[col] - df[col].mean()) / df[col].std()
    return normalised

In [29]:
# min-max normalization
def min_max_norm(df, columns):
    """Return a copy of ``df`` in which every column in ``columns`` is
    rescaled linearly onto [0, 1]. The input frame is left untouched."""
    scaled = df.copy()
    for col in columns:
        lo, hi = df[col].min(), df[col].max()
        scaled[col] = (df[col] - lo) / (hi - lo)
    return scaled

In [30]:
# z-score normalization
from scipy import stats
def zscore_norm(df, columns):
    """Return a NaN-free copy of ``df`` with every column in ``columns``
    z-scored (scipy.stats.zscore, population std / ddof=0).

    FIX: z-score the column of the dropna()'d copy. The original scored
    ``df[feature_name]`` -- with NaNs present that array has the wrong
    length for ``result`` (and scipy propagates the NaNs), so the
    assignment either raised or produced all-NaN columns.
    """
    result = df.dropna().copy()
    for feature_name in columns:
        result[feature_name] = stats.zscore(result[feature_name])
    return result

In [31]:
# normalize
# Topological feature columns to rescale (everything except ids/labels).
features = ['clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges', 'shortest']
# general_measures_df = mean_norm(general_measures_df, features)

In [32]:
# Rescale every feature column onto [0, 1].
general_measures_df = min_max_norm(general_measures_df, features)

In [34]:
# Preview the normalised features.
general_measures_df.head()

Out[34]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.594543 0.414510 0.275433 0.321249 0.162933 0.262021 0.198020 0.090468 0.244690
1 10 Adventure 5 0.944766 0.633743 0.689716 0.671798 0.348697 0.202718 0.188119 0.148627 0.175675
2 102 Crime 3 0.529726 0.241184 0.139809 0.214798 0.058116 0.159062 0.099010 0.029079 0.211461
3 103 Comedy 1 0.635872 0.499929 0.462623 0.632531 0.317045 0.511183 0.475248 0.303716 0.290580
4 104 Adventure 5 0.525530 0.417218 0.107036 0.487364 0.119015 0.399300 0.386139 0.137318 0.246358
In [35]:
# Shape is unchanged by min-max scaling.
general_measures_df.shape

Out[35]:
(728, 12)

Remove outliers (keep only those within -5 and +5 standard deviations)

In [36]:
# Drop rows lying more than 5 standard deviations from a feature's mean;
# the mean/std are recomputed on the progressively filtered frame.
for feature in features:
    col = general_measures_df[feature]
    within = np.abs(col - col.mean()) <= (5 * col.std())
    general_measures_df = general_measures_df[within]

In [37]:
# Rows remaining after outlier removal.
general_measures_df.shape

Out[37]:
(721, 12)
In [38]:
# general_measures_df.to_csv('mg-features.csv', index=False, float_format="%.2f")


Feature selection

• Visualization
In [39]:
# Recall the list of popular genres.
main_genres

Out[39]:
['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']
In [47]:
# Subset of genres to visualise against each other.
genres_list = ['Biography', 'Adventure', 'Horror']

In [48]:
# Restrict the measures table to the genres selected above.
subset_with_genres = general_measures_df.loc[general_measures_df['genres'].isin(genres_list)]

In [49]:
# Columns for the pair plot: the genre label plus the topological features.
features_list = ['genres', 'clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges']

In [50]:
# Data behind the pair plot below.
viz_df = subset_with_genres[features_list]

In [51]:
# Class balance of the selected genres.
viz_df.groupby(['genres']).size()

Out[51]:
genres
Biography    44
Horror       40
dtype: int64
In [52]:
# Pairwise scatter matrix of the features coloured by genre; the upper
# triangle additionally shows a per-genre regression fit.
g = sns.pairplot(viz_df, hue='genres', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

# Alternative KDE-based pair grid, kept from exploration:
# g = sns.PairGrid(viz_df, hue='genres', hue_kws={"cmap": ["Blues", "Reds"]})
# g = g.map_offdiag(sns.kdeplot, lw=3)
# g = g.map_diag(sns.kdeplot, lw=1)

Out[52]:
<seaborn.axisgrid.PairGrid at 0x7fa6f8abb198>

Prepare dataset

In [110]:
# Feature matrix and class labels for the selected genres.
features = features_list[1:]  # drop the 'genres' string column
# FIX: DataFrame.as_matrix() was removed in pandas 1.0; .to_numpy() is the
# documented replacement.
data = subset_with_genres[features].to_numpy()
target = subset_with_genres['target'].to_numpy()
data.shape[0] == target.shape[0]  # sanity check: one label per row

Out[110]:
True
In [111]:
# Number of topological features.
len(features)

Out[111]:
8

### Transformations

• Polynomial features
In [55]:
# Adventure biography
# modularity x deg_std
# modularity x nodes

In [56]:
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=3, interaction_only=True)
# poly_df = pd.DataFrame(poly.fit_transform(general_measures_df[features_list[1:]]), columns=poly.get_feature_names())
# poly_df.shape

In [57]:
# viz_df = poly_df.iloc[:,43:53].join(general_measures_df['genres'])
# viz_df = viz_df.loc[viz_df['genres'].isin(genres_list)]
# viz_df.groupby('genres').size()

In [58]:
# g = sns.pairplot(viz_df, hue='genres')

• Non-linear transformation
In [59]:
# from sklearn import preprocessing

In [60]:
# quantile_transformer = preprocessing.QuantileTransformer(random_state=0, output_distribution='uniform')
# data_trans = pd.DataFrame(quantile_transformer.fit_transform(general_measures_df[features_list[1:]]))
# data_trans = data_trans.join(general_measures_df['genres'])
# data_trans = data_trans.loc[data_trans['genres'].isin(genres_list)]

In [61]:
# sns.pairplot(data_trans, hue='genres')

• Manifold learning
In [62]:
from sklearn import manifold
from matplotlib.ticker import NullFormatter

In [63]:
# Target dimensionality and neighbourhood size for the manifold embeddings.
n_components = 2
n_neighbors = 10

In [64]:
# Embedding dimensions plotted on the x and y axes.
dimX = 0
dimY = 1

In [65]:
# Adapted from: http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html#sphx-glr-auto-examples-manifold-plot-compare-methods-py
fig = plt.figure(figsize=(15, 8))

methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

# LLE-family embeddings (disabled, kept for reference):
# for i, method in enumerate(methods):
#     data_lle_trans = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
#                                         n_components=n_components,
#                                         eigen_solver='auto',
#                                         method=method).fit_transform(data)
#     ax = fig.add_subplot(251 + i)
#     ax.scatter(data_lle_trans[:, dimX], data_lle_trans[:, dimY], c=target)
#     ax.set_title("%s" % labels[i])
#     ax.axis('tight')

# FIX: give each embedding its own subplot -- the original drew all four
# scatter plots onto one implicit axes, each title overwriting the last.
ax = fig.add_subplot(2, 2, 1)
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
data_tsne_trans = tsne.fit_transform(data)
ax.scatter(data_tsne_trans[:, dimX], data_tsne_trans[:, dimY], c=target)
ax.set_title("t-SNE")

ax = fig.add_subplot(2, 2, 2)
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1)
data_mds_trans = mds.fit_transform(data)
ax.scatter(data_mds_trans[:, dimX], data_mds_trans[:, dimY], c=target)
ax.set_title("MDS")

ax = fig.add_subplot(2, 2, 3)
se = manifold.SpectralEmbedding(n_components=n_components,
                                n_neighbors=n_neighbors)
data_se_trans = se.fit_transform(data)
ax.scatter(data_se_trans[:, dimX], data_se_trans[:, dimY], c=target)
ax.set_title("Spectral Embedding")

ax = fig.add_subplot(2, 2, 4)
# keyword arguments: positional use of these params is an error in scikit-learn >= 1.2
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
data_isomap_trans = isomap.fit_transform(data)
ax.scatter(data_isomap_trans[:, dimX], data_isomap_trans[:, dimY], c=target)
ax.set_title("Isomap")

Out[65]:
Text(0.5,1,'Isomap')

Add manifold features to the dataset

In [66]:
# Use the 2-D MDS embedding as two extra features.
manifold_features = ['manifoldX', 'manifoldY']
manifold_data = data_mds_trans

In [67]:
# Append the manifold coordinates to the topological feature matrix
# (and extend the feature-name list to match).
data_man = np.concatenate((data, manifold_data), axis=1)
features_man = np.concatenate((features, manifold_features))

• Visualize manifold features + topological features
In [69]:
# DataFrame view of the augmented feature matrix.
manifold_df = pd.DataFrame(data_man, columns=features_man, dtype='float')

In [70]:
# normalize
# Standardise only the manifold columns (the topological ones were already
# min-max scaled earlier).
manifold_df = mean_norm(manifold_df, manifold_features)

In [71]:
# FIX: np.float was removed in NumPy 1.24; the builtin float (interpreted
# as float64 by astype) is the documented replacement.
manifold_df['target'] = target.astype(float)

In [72]:
# Pair plot of topological + manifold features, coloured by class label,
# with regression fits in the upper triangle.
g = sns.pairplot(manifold_df, hue='target', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

Out[72]:
<seaborn.axisgrid.PairGrid at 0x7fa6e607df60>