#!/usr/bin/env python
# coding: utf-8

# # Graph measures

# Compute graph measures corresponding to the social networks in the movies

# In[1]:

import networkx as nx
from matplotlib import pyplot as plt
import numpy.linalg
import pymongo as pm
import pandas as pd
from community import community_louvain as community
import numpy as np
import seaborn as sns

# In[2]:

plt.style.use(['dark_background', 'seaborn-talk'])

# In[3]:

PATH = './gexf/'

# Read the ID-TITLE list

# In[4]:

fname = "movies-unique.txt"
with open(fname) as f:
    content = f.readlines()
content = [x.strip() for x in content]

import re

movies = {}
for movie in content:
    mid = re.search(r'^\d+', movie)
    title = re.search(r'-.*', movie)
    movies[int(mid.group(0))] = title.group(0)[1:].replace('-', ' ')

# fix some non-ASCII characters, numbers, and alternative movie titles
movies[38] = 'Alien 3'
movies[873] = 'WALL E'
movies[100] = 'Bad Lieutenant: Port of Call New Orleans'
movies[3] = '12 and Holding'
movies[453] = 'Jennifer 8'
movies[575] = 'Mrs Brown'
movies[647] = 'Postino'
movies[735] = 'Shivers'
del movies[885]  # the movie "White Jazz" has never been filmed

# MongoDB connector

# In[5]:

client = pm.MongoClient()
db = client.moviegalaxies
movies_collection = db.movies

# Retrieve the genres of each movie

# In[6]:

# genres of the movies: MG_ID -> GENRE
genres = {}
no_genre = []
for movie in movies_collection.find():
    if movie['Genre'] != 'N/A':
        genres[movie['mg_id']] = movie['Genre'].split(', ')
    else:
        no_genre.append(movie['mg_id'])
print("Movies without a genre:", no_genre)

# Assign a numeric label to each genre. Keep only the popular genres

# In[7]:

genres_df = pd.DataFrame(columns=['mg_id', 'genres', 'target'])

# In[8]:

main_genres = ['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']

# In[9]:

main_genres_dict = {genre: i for i, genre in enumerate(main_genres)}

# In[10]:

main_genres_dict

# In[11]:

class_genre_dict = {i: genre for genre, i in main_genres_dict.items()}

# In[12]:

row = 0
for movie_id, g in genres.items():
    first_genre = g[0]
    if first_genre in main_genres:
        genres_df.loc[row] = [movie_id, first_genre, main_genres_dict[first_genre]]
        row += 1

# In[13]:

genres_df[:10]

# In[14]:

genres_df.shape

# In[15]:

genres_df.groupby('genres').count()

# In[16]:

full_genres_df = pd.DataFrame(columns=['mg_id', 'genres_list', 'target'])

# In[17]:

row = 0
for movie_id, g in genres.items():
    first_genre = g[0]
    if first_genre in main_genres:
        full_genres_df.loc[row] = [movie_id, g, main_genres_dict[first_genre]]
        row += 1

# In[18]:

full_genres_df.head()

# Compute network measures

# In[19]:

def compute_eigenvalues(g):
    """
    Compute the eigenvalues of the normalized Laplacian of a graph g.
    Return: eigenvalues of the graph g.
    """
    g = g.to_undirected()
    L = nx.normalized_laplacian_matrix(g)
    # the normalized Laplacian of an undirected graph is symmetric,
    # so eigvalsh is applicable and returns real eigenvalues, sorted ascending
    e = numpy.linalg.eigvalsh(L.toarray())
    return e
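# The eigenvalues are computed in the measures loop below but never folded into
# the feature table. The next cell is an added sketch (not part of the original
# pipeline) of how a single scalar spectral feature could be derived from them;
# the file name in the usage comment is hypothetical.

# In[ ]:

def spectral_gap(g):
    # eigvalsh returns the spectrum sorted ascending; the smallest eigenvalue
    # of the normalized Laplacian is ~0, and the second-smallest measures how
    # well connected the network is
    ev = compute_eigenvalues(g)
    return ev[1]

# usage (hypothetical file): spectral_gap(nx.read_gexf(PATH + '1.gexf'))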
# In[20]:

general_measures_df = pd.DataFrame(columns=[
    'mg_id', 'clustering', 'assortativity', 'transitivity', 'modularity',
    'deg_mean', 'deg_std', 'nodes', 'edges', 'shortest'])

# In[21]:

loc = 0
for movie_id, title in movies.items():
    # print(movie_id)
    if movie_id != 2:  # graph '2' is incomplete
        g = nx.read_gexf(PATH + str(movie_id) + '.gexf')
        # compute some measures and store them in a pandas dataframe
        # radius = nx.radius(g)
        clustering = nx.average_clustering(g)
        # assortativity: the tendency for vertices in networks to be connected
        # to other vertices that are like (or unlike) them in some way
        # https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.assortativity.html
        # https://arxiv.org/pdf/cond-mat/0209450.pdf
        # nx.degree_assortativity_coefficient(g)
        assortativity = nx.degree_pearson_correlation_coefficient(g)
        transitivity = nx.transitivity(g)
        # estrada = nx.estrada_index(g)
        ev = compute_eigenvalues(g)
        # print(ev)
        degrees = [d for n, d in g.degree()]
        part = community.best_partition(g)
        modularity = community.modularity(part, g)
        shortest_path = nx.average_shortest_path_length(g)
        general_measures_df.loc[loc] = [
            movie_id, clustering, assortativity, transitivity, modularity,
            np.mean(degrees), np.std(degrees),
            len(g.nodes), len(g.edges), shortest_path
        ]
        loc += 1
        # if loc == 10:
        #     break

# In[22]:

general_measures_df.head()

# In[23]:

general_measures_df.shape

# Add the 'genres' and 'target' columns. Only popular genres are kept

# In[24]:

general_measures_df = pd.merge(genres_df, general_measures_df, on='mg_id').dropna()

# In[25]:

general_measures_df.shape

# In[26]:

general_measures_df.head()

# In[27]:

# general_measures_df.hist()
# plt.show()

# Normalize

# In[28]:

# mean normalization: subtract the mean and divide by the standard deviation
def mean_norm(df, columns):
    result = df.copy()
    for feature_name in columns:
        mean_value = df[feature_name].mean()
        std_value = df[feature_name].std()
        result[feature_name] = (df[feature_name] - mean_value) / std_value
    return result

# In[29]:

# min-max normalization: rescale each feature to the [0, 1] range
def min_max_norm(df, columns):
    result = df.copy()
    for feature_name in columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

# In[30]:

# z-score normalization
from scipy import stats

def zscore_norm(df, columns):
    result = df.dropna().copy()
    for feature_name in columns:
        result[feature_name] = stats.zscore(df[feature_name])
    return result

# In[31]:

# normalize
features = ['clustering', 'assortativity', 'transitivity', 'modularity',
            'deg_mean', 'deg_std', 'nodes', 'edges', 'shortest']
# general_measures_df = mean_norm(general_measures_df, features)

# In[32]:

general_measures_df = min_max_norm(general_measures_df, features)

# In[34]:

general_measures_df.head()

# In[35]:

general_measures_df.shape

# Remove outliers (keep only rows within -5 and +5 standard deviations)

# In[36]:

for feature in features:
    general_measures_df = general_measures_df[
        np.abs(general_measures_df[feature] - general_measures_df[feature].mean())
        <= (5 * general_measures_df[feature].std())]

# In[37]:

general_measures_df.shape

# In[38]:

# general_measures_df.to_csv('mg-features.csv', index=False, float_format="%.2f")

# Feature selection

# * Visualization

# In[39]:

main_genres

# In[47]:

genres_list = ['Biography', 'Adventure', 'Horror']

# In[48]:

subset_with_genres = general_measures_df.loc[general_measures_df['genres'].isin(genres_list)]

# In[49]:

features_list = ['genres', 'clustering', 'assortativity', 'transitivity',
                 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges']

# In[50]:

viz_df = subset_with_genres[features_list]

# In[51]:

viz_df.groupby(['genres']).size()

# In[52]:

g = sns.pairplot(viz_df, hue='genres', plot_kws={"s": 20}, palette="hls")
g.map_upper(sns.regplot)
# g = sns.PairGrid(viz_df, hue='genres', hue_kws={"cmap": ["Blues", "Reds"]})
# g = g.map_offdiag(sns.kdeplot, lw=3)
# g = g.map_diag(sns.kdeplot, lw=1)

# Prepare dataset

# In[110]:

features = features_list[1:]
# as_matrix() was removed from pandas; to_numpy() is the current equivalent
data = subset_with_genres[features].to_numpy(dtype=float)
target = subset_with_genres['target'].to_numpy(dtype=int)
data.shape[0] == target.shape[0]

# In[111]:

len(features)
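# Before transforming or fitting anything, it can help to check how balanced
# the three classes are. The cell below is an added illustrative check, not
# part of the original notebook.

# In[ ]:

import collections

# count how many Biography / Adventure / Horror samples survived filtering
print(collections.Counter(class_genre_dict[int(t)] for t in target))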
# ### Transformations

# * Polynomial features

# In[55]:

# Adventure / Biography candidate interactions:
# modularity x deg_std
# modularity x nodes

# In[56]:

# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=3, interaction_only=True)
# poly_df = pd.DataFrame(poly.fit_transform(general_measures_df[features_list[1:]]),
#                        columns=poly.get_feature_names())
# poly_df.shape

# In[57]:

# viz_df = poly_df.iloc[:, 43:53].join(general_measures_df['genres'])
# viz_df = viz_df.loc[viz_df['genres'].isin(genres_list)]
# viz_df.groupby('genres').size()

# In[58]:

# g = sns.pairplot(viz_df, hue='genres')

# * Non-linear transformation

# In[59]:

# from sklearn import preprocessing

# In[60]:

# quantile_transformer = preprocessing.QuantileTransformer(random_state=0, output_distribution='uniform')
# data_trans = pd.DataFrame(quantile_transformer.fit_transform(general_measures_df[features_list[1:]]))
# data_trans = data_trans.join(general_measures_df['genres'])
# data_trans = data_trans.loc[data_trans['genres'].isin(genres_list)]

# In[61]:

# sns.pairplot(data_trans, hue='genres')

# * Manifold learning

# In[62]:

from sklearn import manifold
from matplotlib.ticker import NullFormatter

# In[63]:

n_components = 2
n_neighbors = 10

# In[64]:

dimX = 0
dimY = 1

# In[65]:

# adapted from this source:
# http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html#sphx-glr-auto-examples-manifold-plot-compare-methods-py
fig = plt.figure(figsize=(15, 8))

methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

# for i, method in enumerate(methods):
#     data_lle_trans = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
#                                                      n_components=n_components,
#                                                      eigen_solver='auto',
#                                                      method=method).fit_transform(data)
#     ax = fig.add_subplot(251 + i)
#     plt.scatter(data_lle_trans[:, dimX], data_lle_trans[:, dimY], c=target)
#     plt.title("%s" % labels[i])
#     # ax.xaxis.set_major_formatter(NullFormatter())
#     # ax.yaxis.set_major_formatter(NullFormatter())
#     plt.axis('tight')

ax = fig.add_subplot(256)
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
data_tsne_trans = tsne.fit_transform(data)
plt.scatter(data_tsne_trans[:, dimX], data_tsne_trans[:, dimY], c=target)
plt.title("t-SNE")

ax = fig.add_subplot(257)
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1)
data_mds_trans = mds.fit_transform(data)
plt.scatter(data_mds_trans[:, dimX], data_mds_trans[:, dimY], c=target)
plt.title("MDS")

ax = fig.add_subplot(258)
se = manifold.SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors)
data_se_trans = se.fit_transform(data)
plt.scatter(data_se_trans[:, dimX], data_se_trans[:, dimY], c=target)
plt.title("Spectral Embedding")

ax = fig.add_subplot(259)
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
data_isomap_trans = isomap.fit_transform(data)
plt.scatter(data_isomap_trans[:, dimX], data_isomap_trans[:, dimY], c=target)
plt.title("Isomap")

# Add the manifold features to the dataset

# In[66]:

manifold_features = ['manifoldX', 'manifoldY']
manifold_data = data_mds_trans

# In[67]:

data_man = np.concatenate((data, manifold_data), axis=1)
features_man = np.concatenate((features, manifold_features))
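# The four embeddings above are compared only by eye. The added sketch below
# (not part of the original notebook) scores each one with a silhouette
# coefficient over the genre labels; it assumes the four *_trans arrays from
# the manifold cell above.

# In[ ]:

from sklearn.metrics import silhouette_score

for name, emb in [("t-SNE", data_tsne_trans), ("MDS", data_mds_trans),
                  ("Spectral", data_se_trans), ("Isomap", data_isomap_trans)]:
    # higher is better; values near 0 mean the genres overlap in the embedding
    print(name, silhouette_score(emb, target))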
# * Visualize manifold features + topological features

# In[69]:

manifold_df = pd.DataFrame(data_man, columns=features_man, dtype='float')

# In[70]:

# normalize
manifold_df = mean_norm(manifold_df, manifold_features)

# In[71]:

manifold_df['target'] = target.astype(float)

# In[72]:

g = sns.pairplot(manifold_df, hue='target', plot_kws={"s": 20}, palette="hls")
g.map_upper(sns.regplot)

# ### Supervised

# * Random forest. Grid search

# In[73]:

from sklearn.ensemble import RandomForestClassifier

# In[74]:

from sklearn.tree import export_graphviz

# In[75]:

from sklearn.model_selection import GridSearchCV

# In[76]:

rfc = RandomForestClassifier(n_estimators=100)

# In[77]:

param_grid = {"max_depth": [3, None],
              "max_features": range(1, len(features)),
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# In[78]:

grid = GridSearchCV(rfc, param_grid=param_grid)
grid.fit(data, target)

# In[79]:

grid.best_score_

# In[80]:

grid.best_params_

# * Build a Random Forest model using the best parameters

# In[81]:

rfc = grid.best_estimator_
rfc.fit(data, target)

# In[82]:

print('Importance of features')
print(sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), features),
             reverse=True))

# In[ ]:

# export_graphviz(rfc.estimators_[1], feature_names=features, class_names=genres_list,
#                 filled=True, rounded=True, out_file='tree.dot')

# * Decision tree. Grid search

# In[112]:

from sklearn.tree import DecisionTreeClassifier

# In[113]:

dtc = DecisionTreeClassifier()

# In[114]:

param_grid = {"max_depth": [3, None],
              "max_features": range(1, len(features)),
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "criterion": ["gini", "entropy"]}

# In[115]:

grid = GridSearchCV(dtc, param_grid)

# In[116]:

grid.fit(data, target)

# In[117]:

grid.best_score_

# In[118]:

grid.best_params_

# In[119]:

dtc = grid.best_estimator_

# In[120]:

dtc.fit(data, target)

# In[121]:

export_graphviz(dtc, feature_names=features, class_names=genres_list,
                filled=True, rounded=True, out_file='tree.dot')

# In[122]:

classes = dtc.predict(data)
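# Before visualizing the per-genre breakdown, a confusion matrix gives a
# compact view of where the tuned tree goes wrong. This cell is an added
# sketch (not in the original notebook); note that it scores the same data
# the tree was fit on, so the numbers are optimistic.

# In[ ]:

from sklearn.metrics import confusion_matrix

# rows: true genres, columns: predicted genres, both in genres_list order
print(genres_list)
print(confusion_matrix(target, classes,
                       labels=[main_genres_dict[g] for g in genres_list]))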
""" node = {} if clf.tree_.children_left[node_index] == -1: # indicates leaf count_labels = zip(clf.tree_.value[node_index, 0], labels) node['name'] = ', '.join(('{} of {}'.format(int(count), label) for count, label in count_labels)) # node['type']='leaf' # node['value'] = clf.tree_.value[node_index, 0].tolist() # node['error'] = np.float64(clf.tree_.impurity[node_index]).item() # node['samples'] = clf.tree_.n_node_samples[node_index] else: feature = features[clf.tree_.feature[node_index]] threshold = clf.tree_.threshold[node_index] # node['type']='split' error = np.float64(clf.tree_.impurity[node_index]).item() node['name'] = '{0} > {1:.2f}'.format(feature, threshold) # node['error'] = np.float64(clf.tree_.impurity[node_index]).item() # node['samples'] = clf.tree_.n_node_samples[node_index] # node['value'] = clf.tree_.value[node_index, 0].tolist() left_index = clf.tree_.children_left[node_index] right_index = clf.tree_.children_right[node_index] node['children'] = [rules(clf, features, labels, right_index), rules(clf, features, labels, left_index)] return node class MyEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.integer): return int(obj) elif isinstance(obj, np.floating): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() else: return super(MyEncoder, self).default(obj) # In[95]: # save decision tree as JSON d = rules(dtc, features, genres_list) with open('output.json', 'w') as outfile: json.dump(d, outfile,cls=MyEncoder) # Visualize classification results # In[123]: viz_df = subset_with_genres[features_list] viz_df['classes'] = classes viz_df.groupby('classes').size() # In[124]: classes_df = subset_with_genres classes_df['classes'] = classes genre_indexes = [] for i in range(len(genres_list)): genre_indexes.append(main_genres_dict[genres_list[i]]) fig = plt.figure(figsize=(20,10)) fig.suptitle('Classification results. Accuracy: {0:.2f}'.format(grid.best_score_)) for c in genre_indexes: count_genres = {} for genre in genres_list: count_genres[genre] = 0 for genre in list(classes_df.loc[classes_df['classes'] == c]['genres']): # for g in genre: # count_genres[g] += 1 # keep only the first genre count_genres[genre] +=1 # # sort before plotting import operator sorted_genres = sorted(count_genres.items(), key=operator.itemgetter(1), reverse=True) titles = [] counts = [] for g in sorted_genres: titles.append(g[0]) counts.append(g[1]) ax = fig.add_subplot(2,4,c+1) # plt.figure(figsize=(5,5)) ax.bar(titles, counts) ax.set_title(class_genre_dict[c]) # data = [] # for pair in zip(titles, counts): # data.append("{{\"genre\":\"{}\", \"count\":{}}}".format(pair[0], pair[1])) # print(data) # * SVM. 
# * SVM. Grid search

# In[125]:

from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

C_range = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.7, 1, 10, 100, 1000, 10000]
gamma_range = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.7, 1, 10, 100, 1000, 10000]
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=42)
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
grid.fit(data, target)

# In[126]:

grid.best_score_

# In[127]:

grid.best_params_

# In[128]:

params = grid.best_params_

# Build an SVM model using the best parameters

# In[129]:

svm_model = svm.SVC(kernel='rbf', gamma=params['gamma'], C=params['C'])

# In[130]:

svm_model.fit(data, target)

# In[131]:

# classes = svm_model.predict(data)

# Look at the secondary genres of the misclassified movies

# In[132]:

viz_df = viz_df.join(subset_with_genres['mg_id'])

# In[133]:

viz_full_genres_df = pd.merge(viz_df, full_genres_df[['mg_id', 'genres_list']], on='mg_id')

# In[135]:

c_ids = viz_full_genres_df.classes.unique()
fig = plt.figure(figsize=(30, 10))
for c in c_ids:
    genres_by_class = viz_full_genres_df.loc[viz_full_genres_df['classes'] == c]['genres_list'].values
    # keep only the movies whose first genre differs from the predicted class
    genres_by_class = list(filter(lambda l: l[0] != class_genre_dict[c], genres_by_class))
    # collect the secondary genres of those misclassified movies
    misclassified_genres = [item for sublist in genres_by_class for item in sublist[1:]]
    mg_dict = {}
    for genre in set(misclassified_genres):
        mg_dict[genre] = 0
    for genre in misclassified_genres:
        mg_dict[genre] += 1

    # sort before plotting
    sorted_genres = sorted(mg_dict.items(), key=lambda kv: kv[1], reverse=True)
    titles = []
    counts = []
    for g in sorted_genres:
        titles.append(g[0])
        counts.append(g[1])
    ax = fig.add_subplot(2, 4, c + 1)
    # plt.figure(figsize=(5,5))
    ax.bar(titles[:5], counts[:5])
    ax.set_title('Misclassified secondary genres for class ' + class_genre_dict[c])
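# As a closing sanity check, the added cell below cross-validates the tuned
# SVM with the same stratified splits used for the grid search. This sketch
# is not part of the original notebook.

# In[ ]:

from sklearn.model_selection import cross_val_score

# mean and spread of held-out accuracy over the 20 stratified splits
scores = cross_val_score(svm_model, data, target, cv=cv)
print(scores.mean(), scores.std())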