# Graph measures

Compute graph measures corresponding to social networks in the movies

In [1]:
import networkx as nx
from matplotlib import pyplot as plt
import numpy.linalg
import pymongo as pm
import pandas as pd
from community import community_louvain as community
import numpy as np
import seaborn as sns

In [2]:
# Dark theme with larger "talk"-sized fonts.
# NOTE: matplotlib >= 3.6 renamed the seaborn styles to 'seaborn-v0_8-*';
# fall back to the new name if the old one is gone.
try:
    plt.style.use(['dark_background', 'seaborn-talk'])
except OSError:
    plt.style.use(['dark_background', 'seaborn-v0_8-talk'])

In [3]:
# Directory holding one GEXF social-network file per movie, named '<mg_id>.gexf'.
PATH = './gexf/'


In [4]:
fname = "movies-unique.txt"

# One movie per line, formatted "<numeric id>-<Dash-Separated-Title>".
with open(fname) as f:
    # FIX: read from the file handle; the original iterated over the
    # not-yet-defined name 'content' and raised a NameError.
    content = [x.strip() for x in f]

import re
movies = {}  # mg_id -> human-readable title
for movie in content:
    id_match = re.search(r'(^\d+)', movie)    # leading numeric id (raw strings avoid invalid-escape warnings)
    title_match = re.search(r'(-.*)', movie)  # everything after the first dash
    movies[int(id_match.group(0))] = title_match.group(0)[1:].replace('-', ' ')

# fix some non-ASCII characters, numbers, and alternative movie titles
movies[38] = 'Alien 3'
movies[873] = 'WALL E'
movies[100] = 'Bad Lieutenant: Port of Call New Orleans'
movies[3] = '12 and Holding'
movies[453] = 'Jennifer 8'
movies[575] = 'Mrs Brown'
movies[647] = 'Postino'
movies[735] = 'Shivers'
del movies[885] # the movie "White Jazz" has never been filmed


MongoDB connector

In [5]:
# Connect to the local MongoDB instance (default host/port) and grab the
# 'movies' collection of the 'moviegalaxies' database.
client = pm.MongoClient()
db = client.moviegalaxies
movies_collection = db.movies


Retrieve genres for each movie

In [6]:
# genres of the movies MG_ID-GENRE
# Build mg_id -> list-of-genre-strings; movies without genre metadata are
# collected in no_genre instead.
genres = {}
no_genre = []
for doc in movies_collection.find():
    genre_field = doc['Genre']
    if genre_field == 'N/A':
        no_genre.append(doc['mg_id'])
    else:
        genres[doc['mg_id']] = genre_field.split(', ')
print(no_genre, "do(es) not have genre")

[906] do(es) not have genre


Assign labels to each genre. Keep only popular genres

In [7]:
# One row per movie: mg_id, primary genre string, and numeric class label.
genres_df = pd.DataFrame(columns=['mg_id', 'genres', 'target'])

In [8]:
# Popular genres kept as classification targets.
main_genres = ['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']

In [9]:
main_genres_dict = {main_genres[i]: i for i in range(len(main_genres))}

In [10]:
# Inspect the genre -> label mapping.
main_genres_dict

Out[10]:
{'Action': 0,
'Biography': 4,
'Comedy': 1,
'Crime': 3,
'Drama': 2,
'Horror': 6}
In [11]:
class_genre_dict = dict (zip(main_genres_dict.values(), main_genres_dict.keys()))

In [12]:
# Keep only movies whose first-listed genre is one of the popular genres,
# appending one (mg_id, primary genre, label) row per kept movie.
row = 0
for movie_id, movie_genres in genres.items():
    primary = movie_genres[0]
    if primary not in main_genres:
        continue
    genres_df.loc[row] = [movie_id, primary, main_genres_dict[primary]]
    row += 1

In [13]:
# Preview the first ten labelled movies.
genres_df[:10]

Out[13]:
mg_id genres target
0 101 Comedy 1
2 102 Crime 3
3 103 Comedy 1
5 106 Action 0
6 107 Crime 3
7 108 Biography 4
8 109 Action 0
9 110 Action 0
In [14]:
# Number of movies that received a label.
genres_df.shape

Out[14]:
(731, 3)
In [15]:
# Class balance: number of movies per primary genre.
genres_df.groupby('genres').count()

Out[15]:
mg_id target
genres
Action 200 200
Biography 45 45
Comedy 171 171
Crime 82 82
Drama 151 151
Horror 41 41
In [16]:
# Like genres_df, but keeps the movie's full genre list instead of only the first.
full_genres_df = pd.DataFrame(columns=['mg_id', 'genres_list', 'target'])

In [17]:
# Same labelling pass, but store the full genre list per movie.
row = 0
for movie_id, movie_genres in genres.items():
    primary = movie_genres[0]
    if primary in main_genres:
        full_genres_df.loc[row] = [movie_id, movie_genres, main_genres_dict[primary]]
        row += 1

In [18]:
# Preview the table with full genre lists.
full_genres_df.head()

Out[18]:
mg_id genres_list target
0 101 [Comedy, Crime, Drama] 1
2 102 [Crime, Drama] 3
3 103 [Comedy, Drama, Music] 1
4 104 [Adventure, Drama, History] 5

Compute network measures

In [19]:
def compute_eigenvalues(g):
    """
    Compute the eigenvalues of the normalized Laplacian of graph ``g``.

    Parameters
    ----------
    g : networkx graph (directed graphs are converted to undirected first)

    Returns
    -------
    numpy.ndarray
        Eigenvalues as returned by ``numpy.linalg.eigvals`` (may be
        complex-typed even though the matrix is symmetric).
    """
    g = g.to_undirected()
    L = nx.normalized_laplacian_matrix(g)
    # FIX: use .toarray() -- the sparse-matrix .A shorthand is deprecated and
    # removed for sparse arrays in modern SciPy.
    e = numpy.linalg.eigvals(L.toarray())
    return e

In [20]:
# One row of topological measures per movie graph (filled in the next cell).
general_measures_df = pd.DataFrame(columns=[
'mg_id',
'clustering',
'assortativity',
'transitivity',
'modularity',
'deg_mean',
'deg_std',
'nodes',
'edges',
'shortest'])

In [21]:
# Fill general_measures_df: load each movie's GEXF graph and compute a row
# of network statistics.
loc = 0
for movie_id, title in movies.items():
    if movie_id == 2:  # graph '2' is incomplete -- skip it
        continue
    g = nx.read_gexf(PATH + str(movie_id) + '.gexf')

    clustering = nx.average_clustering(g)
    # Degree assortativity: the tendency for vertices to connect to vertices
    # with similar (or dissimilar) degree. Pearson-correlation variant of
    # nx.degree_assortativity_coefficient.
    # https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.assortativity.html
    # https://arxiv.org/pdf/cond-mat/0209450.pdf
    assortativity = nx.degree_pearson_correlation_coefficient(g)
    transitivity = nx.transitivity(g)
    ev = compute_eigenvalues(g)  # NOTE(review): computed but not stored in the table
    degrees = [d for n, d in g.degree()]
    part = community.best_partition(g)
    modularity = community.modularity(part, g)
    shortest_path = nx.average_shortest_path_length(g)

    general_measures_df.loc[loc] = [
        movie_id,
        clustering,
        assortativity,
        transitivity,
        modularity,
        np.mean(degrees),
        np.std(degrees),
        len(g.nodes),
        len(g.edges),
        shortest_path,
    ]
    loc += 1

/home/volodymyrmiz/miniconda3/lib/python3.6/site-packages/scipy/stats/stats.py:3003: RuntimeWarning: invalid value encountered in double_scalars
r = r_num / r_den

In [22]:
# Inspect the first few computed measures.
general_measures_df.head()

Out[22]:
mg_id clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 100.0 0.805678 -0.342569 0.280090 0.446111 4.307692 5.598323 39.0 84.0 1.929825
1 101.0 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
2 10.0 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
3 102.0 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
4 103.0 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
In [23]:
# Number of graphs processed.
general_measures_df.shape

Out[23]:
(772, 10)

Add 'genres' and 'target' columns. Only popular genres are kept

In [24]:
# Attach genre labels; merging on mg_id also restricts the table to the
# popular genres, and dropna() removes rows with undefined measures.
general_measures_df = genres_df.merge(general_measures_df, on='mg_id').dropna()

In [25]:
# Rows remaining after the merge and NaN removal.
general_measures_df.shape

Out[25]:
(728, 12)
In [26]:
# Measures table with genre labels attached.
general_measures_df.head()

Out[26]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
1 10 Adventure 5 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
2 102 Crime 3 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
3 103 Comedy 1 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
4 104 Adventure 5 0.637863 -0.382850 0.197007 0.335873 4.042553 6.236330 47.0 95.0 1.980574
In [27]:
# general_measures_df.hist()
# plt.show()


Normalize

In [28]:
# mean normalization
def mean_norm(df, columns):
    """Return a copy of ``df`` in which every column in ``columns`` is
    standardised to zero mean and unit (sample, ddof=1) standard deviation.
    The input frame is left untouched."""
    normalised = df.copy()
    for col in columns:
        normalised[col] = (df[col] - df[col].mean()) / df[col].std()
    return normalised

In [29]:
# min-max normalization
def min_max_norm(df, columns):
    """Return a copy of ``df`` in which every column in ``columns`` is
    rescaled linearly onto [0, 1]. The input frame is left untouched."""
    scaled = df.copy()
    for col in columns:
        lo, hi = df[col].min(), df[col].max()
        scaled[col] = (df[col] - lo) / (hi - lo)
    return scaled

In [30]:
# z-score normalization
from scipy import stats
def zscore_norm(df, columns):
    """Return a NaN-free copy of ``df`` with every column in ``columns``
    z-scored (scipy.stats.zscore, population std / ddof=0).

    FIX: z-score the column of the dropna()'d copy. The original scored
    ``df[feature_name]`` -- with NaNs present that array has the wrong
    length for ``result`` (and scipy propagates the NaNs), so the
    assignment either raised or produced all-NaN columns.
    """
    result = df.dropna().copy()
    for feature_name in columns:
        result[feature_name] = stats.zscore(result[feature_name])
    return result

In [31]:
# normalize
# Topological feature columns to rescale (everything except ids/labels).
features = ['clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges', 'shortest']
# general_measures_df = mean_norm(general_measures_df, features)

In [32]:
# Rescale every feature column onto [0, 1].
general_measures_df = min_max_norm(general_measures_df, features)

In [34]:
# Preview the normalised features.
general_measures_df.head()

Out[34]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.594543 0.414510 0.275433 0.321249 0.162933 0.262021 0.198020 0.090468 0.244690
1 10 Adventure 5 0.944766 0.633743 0.689716 0.671798 0.348697 0.202718 0.188119 0.148627 0.175675
2 102 Crime 3 0.529726 0.241184 0.139809 0.214798 0.058116 0.159062 0.099010 0.029079 0.211461
3 103 Comedy 1 0.635872 0.499929 0.462623 0.632531 0.317045 0.511183 0.475248 0.303716 0.290580
4 104 Adventure 5 0.525530 0.417218 0.107036 0.487364 0.119015 0.399300 0.386139 0.137318 0.246358
In [35]:
# Shape is unchanged by min-max scaling.
general_measures_df.shape

Out[35]:
(728, 12)

Remove outliers (keep only those within -5 and +5 standard deviations)

In [36]:
# Drop rows lying more than 5 standard deviations from a feature's mean;
# the mean/std are recomputed on the progressively filtered frame.
for feature in features:
    col = general_measures_df[feature]
    within = np.abs(col - col.mean()) <= (5 * col.std())
    general_measures_df = general_measures_df[within]

In [37]:
# Rows remaining after outlier removal.
general_measures_df.shape

Out[37]:
(721, 12)
In [38]:
# general_measures_df.to_csv('mg-features.csv', index=False, float_format="%.2f")


Feature selection

• Visualization
In [39]:
# Recall the list of popular genres.
main_genres

Out[39]:
['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']
In [47]:
# Subset of genres to visualise against each other.
genres_list = ['Biography', 'Adventure', 'Horror']

In [48]:
# Restrict the measures table to the genres selected above.
subset_with_genres = general_measures_df.loc[general_measures_df['genres'].isin(genres_list)]

In [49]:
# Columns for the pair plot: the genre label plus the topological features.
features_list = ['genres', 'clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges']

In [50]:
# Data behind the pair plot below.
viz_df = subset_with_genres[features_list]

In [51]:
# Class balance of the selected genres.
viz_df.groupby(['genres']).size()

Out[51]:
genres
Biography    44
Horror       40
dtype: int64
In [52]:
# Pairwise scatter matrix of the features coloured by genre; the upper
# triangle additionally shows a per-genre regression fit.
g = sns.pairplot(viz_df, hue='genres', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

# Alternative KDE-based pair grid, kept from exploration:
# g = sns.PairGrid(viz_df, hue='genres', hue_kws={"cmap": ["Blues", "Reds"]})
# g = g.map_offdiag(sns.kdeplot, lw=3)
# g = g.map_diag(sns.kdeplot, lw=1)

Out[52]:
<seaborn.axisgrid.PairGrid at 0x7fa6f8abb198>

Prepare dataset

In [110]:
# Feature matrix and class labels for the selected genres.
features = features_list[1:]  # drop the 'genres' string column
# FIX: DataFrame.as_matrix() was removed in pandas 1.0; .to_numpy() is the
# documented replacement.
data = subset_with_genres[features].to_numpy()
target = subset_with_genres['target'].to_numpy()
data.shape[0] == target.shape[0]  # sanity check: one label per row

Out[110]:
True
In [111]:
# Number of topological features.
len(features)

Out[111]:
8

### Transformations

• Polynomial features
In [55]:
# Adventure biography
# modularity x deg_std
# modularity x nodes

In [56]:
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=3, interaction_only=True)
# poly_df = pd.DataFrame(poly.fit_transform(general_measures_df[features_list[1:]]), columns=poly.get_feature_names())
# poly_df.shape

In [57]:
# viz_df = poly_df.iloc[:,43:53].join(general_measures_df['genres'])
# viz_df = viz_df.loc[viz_df['genres'].isin(genres_list)]
# viz_df.groupby('genres').size()

In [58]:
# g = sns.pairplot(viz_df, hue='genres')

• Non-linear transformation
In [59]:
# from sklearn import preprocessing

In [60]:
# quantile_transformer = preprocessing.QuantileTransformer(random_state=0, output_distribution='uniform')
# data_trans = pd.DataFrame(quantile_transformer.fit_transform(general_measures_df[features_list[1:]]))
# data_trans = data_trans.join(general_measures_df['genres'])
# data_trans = data_trans.loc[data_trans['genres'].isin(genres_list)]

In [61]:
# sns.pairplot(data_trans, hue='genres')

• Manifold learning
In [62]:
from sklearn import manifold
from matplotlib.ticker import NullFormatter

In [63]:
# Target dimensionality and neighbourhood size for the manifold embeddings.
n_components = 2
n_neighbors = 10

In [64]:
# Embedding dimensions plotted on the x and y axes.
dimX = 0
dimY = 1

In [65]:
# Adapted from: http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html#sphx-glr-auto-examples-manifold-plot-compare-methods-py
fig = plt.figure(figsize=(15, 8))

methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

# LLE-family embeddings (disabled, kept for reference):
# for i, method in enumerate(methods):
#     data_lle_trans = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
#                                         n_components=n_components,
#                                         eigen_solver='auto',
#                                         method=method).fit_transform(data)
#     ax = fig.add_subplot(251 + i)
#     ax.scatter(data_lle_trans[:, dimX], data_lle_trans[:, dimY], c=target)
#     ax.set_title("%s" % labels[i])
#     ax.axis('tight')

# FIX: give each embedding its own subplot -- the original drew all four
# scatter plots onto one implicit axes, each title overwriting the last.
ax = fig.add_subplot(2, 2, 1)
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
data_tsne_trans = tsne.fit_transform(data)
ax.scatter(data_tsne_trans[:, dimX], data_tsne_trans[:, dimY], c=target)
ax.set_title("t-SNE")

ax = fig.add_subplot(2, 2, 2)
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1)
data_mds_trans = mds.fit_transform(data)
ax.scatter(data_mds_trans[:, dimX], data_mds_trans[:, dimY], c=target)
ax.set_title("MDS")

ax = fig.add_subplot(2, 2, 3)
se = manifold.SpectralEmbedding(n_components=n_components,
                                n_neighbors=n_neighbors)
data_se_trans = se.fit_transform(data)
ax.scatter(data_se_trans[:, dimX], data_se_trans[:, dimY], c=target)
ax.set_title("Spectral Embedding")

ax = fig.add_subplot(2, 2, 4)
# keyword arguments: positional use of these params is an error in scikit-learn >= 1.2
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
data_isomap_trans = isomap.fit_transform(data)
ax.scatter(data_isomap_trans[:, dimX], data_isomap_trans[:, dimY], c=target)
ax.set_title("Isomap")

Out[65]:
Text(0.5,1,'Isomap')

Add manifold features to the dataset

In [66]:
# Use the 2-D MDS embedding as two extra features.
manifold_features = ['manifoldX', 'manifoldY']
manifold_data = data_mds_trans

In [67]:
# Append the manifold coordinates to the topological feature matrix
# (and extend the feature-name list to match).
data_man = np.concatenate((data, manifold_data), axis=1)
features_man = np.concatenate((features, manifold_features))

• Visualize manifold features + topological features
In [69]:
# DataFrame view of the augmented feature matrix.
manifold_df = pd.DataFrame(data_man, columns=features_man, dtype='float')

In [70]:
# normalize
# Standardise only the manifold columns (the topological ones were already
# min-max scaled earlier).
manifold_df = mean_norm(manifold_df, manifold_features)

In [71]:
# FIX: np.float was removed in NumPy 1.24; the builtin float (interpreted
# as float64 by astype) is the documented replacement.
manifold_df['target'] = target.astype(float)

In [72]:
# Pair plot of topological + manifold features, coloured by class label,
# with regression fits in the upper triangle.
g = sns.pairplot(manifold_df, hue='target', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

Out[72]:
<seaborn.axisgrid.PairGrid at 0x7fa6e607df60>