# Graph measures

Compute graph measures corresponding to social networks in the movies

In [1]:
import networkx as nx
from matplotlib import pyplot as plt
import numpy.linalg
import pymongo as pm
import pandas as pd
from community import community_louvain as community
import numpy as np
import seaborn as sns

In [2]:
# Dark theme with larger "talk"-sized fonts.
# NOTE: matplotlib >= 3.6 renamed the seaborn styles to 'seaborn-v0_8-*';
# fall back to the new name if the old one is gone.
try:
    plt.style.use(['dark_background', 'seaborn-talk'])
except OSError:
    plt.style.use(['dark_background', 'seaborn-v0_8-talk'])

In [3]:
# Directory holding one GEXF social-network file per movie, named '<mg_id>.gexf'.
PATH = './gexf/'


In [4]:
fname = "movies-unique.txt"

# One movie per line, formatted "<numeric id>-<Dash-Separated-Title>".
with open(fname) as f:
    # FIX: read from the file handle; the original iterated over the
    # not-yet-defined name 'content' and raised a NameError.
    content = [x.strip() for x in f]

import re
movies = {}  # mg_id -> human-readable title
for movie in content:
    id_match = re.search(r'(^\d+)', movie)    # leading numeric id (raw strings avoid invalid-escape warnings)
    title_match = re.search(r'(-.*)', movie)  # everything after the first dash
    movies[int(id_match.group(0))] = title_match.group(0)[1:].replace('-', ' ')

# fix some non-ASCII characters, numbers, and alternative movie titles
movies[38] = 'Alien 3'
movies[873] = 'WALL E'
movies[100] = 'Bad Lieutenant: Port of Call New Orleans'
movies[3] = '12 and Holding'
movies[453] = 'Jennifer 8'
movies[575] = 'Mrs Brown'
movies[647] = 'Postino'
movies[735] = 'Shivers'
del movies[885] # the movie "White Jazz" has never been filmed


MongoDB connector

In [5]:
# Connect to the local MongoDB instance (default host/port) and grab the
# 'movies' collection of the 'moviegalaxies' database.
client = pm.MongoClient()
db = client.moviegalaxies
movies_collection = db.movies


Retrieve genres for each movie

In [6]:
# genres of the movies MG_ID-GENRE
# Build mg_id -> list-of-genre-strings; movies without genre metadata are
# collected in no_genre instead.
genres = {}
no_genre = []
for doc in movies_collection.find():
    genre_field = doc['Genre']
    if genre_field == 'N/A':
        no_genre.append(doc['mg_id'])
    else:
        genres[doc['mg_id']] = genre_field.split(', ')
print(no_genre, "do(es) not have genre")

[906] do(es) not have genre


Assign labels to each genre. Keep only popular genres

In [7]:
# One row per movie: mg_id, primary genre string, and numeric class label.
genres_df = pd.DataFrame(columns=['mg_id', 'genres', 'target'])

In [8]:
# Popular genres kept as classification targets.
main_genres = ['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']

In [9]:
main_genres_dict = {main_genres[i]: i for i in range(len(main_genres))}

In [10]:
# Inspect the genre -> label mapping.
main_genres_dict

Out[10]:
{'Action': 0,
'Biography': 4,
'Comedy': 1,
'Crime': 3,
'Drama': 2,
'Horror': 6}
In [11]:
class_genre_dict = dict (zip(main_genres_dict.values(), main_genres_dict.keys()))

In [12]:
# Keep only movies whose first-listed genre is one of the popular genres,
# appending one (mg_id, primary genre, label) row per kept movie.
row = 0
for movie_id, movie_genres in genres.items():
    primary = movie_genres[0]
    if primary not in main_genres:
        continue
    genres_df.loc[row] = [movie_id, primary, main_genres_dict[primary]]
    row += 1

In [13]:
# Preview the first ten labelled movies.
genres_df[:10]

Out[13]:
mg_id genres target
0 101 Comedy 1
2 102 Crime 3
3 103 Comedy 1
5 106 Action 0
6 107 Crime 3
7 108 Biography 4
8 109 Action 0
9 110 Action 0
In [14]:
# Number of movies that received a label.
genres_df.shape

Out[14]:
(731, 3)
In [15]:
# Class balance: number of movies per primary genre.
genres_df.groupby('genres').count()

Out[15]:
mg_id target
genres
Action 200 200
Biography 45 45
Comedy 171 171
Crime 82 82
Drama 151 151
Horror 41 41
In [16]:
# Like genres_df, but keeps the movie's full genre list instead of only the first.
full_genres_df = pd.DataFrame(columns=['mg_id', 'genres_list', 'target'])

In [17]:
# Same labelling pass, but store the full genre list per movie.
row = 0
for movie_id, movie_genres in genres.items():
    primary = movie_genres[0]
    if primary in main_genres:
        full_genres_df.loc[row] = [movie_id, movie_genres, main_genres_dict[primary]]
        row += 1

In [18]:
# Preview the table with full genre lists.
full_genres_df.head()

Out[18]:
mg_id genres_list target
0 101 [Comedy, Crime, Drama] 1
2 102 [Crime, Drama] 3
3 103 [Comedy, Drama, Music] 1
4 104 [Adventure, Drama, History] 5

Compute network measures

In [19]:
def compute_eigenvalues(g):
    """
    Compute the eigenvalues of the normalized Laplacian of graph ``g``.

    Parameters
    ----------
    g : networkx graph (directed graphs are converted to undirected first)

    Returns
    -------
    numpy.ndarray
        Eigenvalues as returned by ``numpy.linalg.eigvals`` (may be
        complex-typed even though the matrix is symmetric).
    """
    g = g.to_undirected()
    L = nx.normalized_laplacian_matrix(g)
    # FIX: use .toarray() -- the sparse-matrix .A shorthand is deprecated and
    # removed for sparse arrays in modern SciPy.
    e = numpy.linalg.eigvals(L.toarray())
    return e

In [20]:
# One row of topological measures per movie graph (filled in the next cell).
general_measures_df = pd.DataFrame(columns=[
'mg_id',
'clustering',
'assortativity',
'transitivity',
'modularity',
'deg_mean',
'deg_std',
'nodes',
'edges',
'shortest'])

In [21]:
# Fill general_measures_df: load each movie's GEXF graph and compute a row
# of network statistics.
loc = 0
for movie_id, title in movies.items():
    if movie_id == 2:  # graph '2' is incomplete -- skip it
        continue
    g = nx.read_gexf(PATH + str(movie_id) + '.gexf')

    clustering = nx.average_clustering(g)
    # Degree assortativity: the tendency for vertices to connect to vertices
    # with similar (or dissimilar) degree. Pearson-correlation variant of
    # nx.degree_assortativity_coefficient.
    # https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.assortativity.html
    # https://arxiv.org/pdf/cond-mat/0209450.pdf
    assortativity = nx.degree_pearson_correlation_coefficient(g)
    transitivity = nx.transitivity(g)
    ev = compute_eigenvalues(g)  # NOTE(review): computed but not stored in the table
    degrees = [d for n, d in g.degree()]
    part = community.best_partition(g)
    modularity = community.modularity(part, g)
    shortest_path = nx.average_shortest_path_length(g)

    general_measures_df.loc[loc] = [
        movie_id,
        clustering,
        assortativity,
        transitivity,
        modularity,
        np.mean(degrees),
        np.std(degrees),
        len(g.nodes),
        len(g.edges),
        shortest_path,
    ]
    loc += 1

/home/volodymyrmiz/miniconda3/lib/python3.6/site-packages/scipy/stats/stats.py:3003: RuntimeWarning: invalid value encountered in double_scalars
r = r_num / r_den

In [22]:
# Inspect the first few computed measures.
general_measures_df.head()

Out[22]:
mg_id clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 100.0 0.805678 -0.342569 0.280090 0.446111 4.307692 5.598323 39.0 84.0 1.929825
1 101.0 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
2 10.0 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
3 102.0 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
4 103.0 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
In [23]:
# Number of graphs processed.
general_measures_df.shape

Out[23]:
(772, 10)

Add 'genres' and 'target' columns. Only popular genres are kept

In [24]:
# Attach genre labels; merging on mg_id also restricts the table to the
# popular genres, and dropna() removes rows with undefined measures.
general_measures_df = genres_df.merge(general_measures_df, on='mg_id').dropna()

In [25]:
# Rows remaining after the merge and NaN removal.
general_measures_df.shape

Out[25]:
(728, 12)
In [26]:
# Measures table with genre labels attached.
general_measures_df.head()

Out[26]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
1 10 Adventure 5 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
2 102 Crime 3 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
3 103 Comedy 1 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
4 104 Adventure 5 0.637863 -0.382850 0.197007 0.335873 4.042553 6.236330 47.0 95.0 1.980574
In [27]:
# general_measures_df.hist()
# plt.show()


Normalize

In [28]:
# mean normalization
def mean_norm(df, columns):
    """Return a copy of ``df`` in which every column in ``columns`` is
    standardised to zero mean and unit (sample, ddof=1) standard deviation.
    The input frame is left untouched."""
    normalised = df.copy()
    for col in columns:
        normalised[col] = (df[col] - df[col].mean()) / df[col].std()
    return normalised

In [29]:
# min-max normalization
def min_max_norm(df, columns):
    """Return a copy of ``df`` in which every column in ``columns`` is
    rescaled linearly onto [0, 1]. The input frame is left untouched."""
    scaled = df.copy()
    for col in columns:
        lo, hi = df[col].min(), df[col].max()
        scaled[col] = (df[col] - lo) / (hi - lo)
    return scaled

In [30]:
# z-score normalization
from scipy import stats
def zscore_norm(df, columns):
    """Return a NaN-free copy of ``df`` with every column in ``columns``
    z-scored (scipy.stats.zscore, population std / ddof=0).

    FIX: z-score the column of the dropna()'d copy. The original scored
    ``df[feature_name]`` -- with NaNs present that array has the wrong
    length for ``result`` (and scipy propagates the NaNs), so the
    assignment either raised or produced all-NaN columns.
    """
    result = df.dropna().copy()
    for feature_name in columns:
        result[feature_name] = stats.zscore(result[feature_name])
    return result

In [31]:
# normalize
# Topological feature columns to rescale (everything except ids/labels).
features = ['clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges', 'shortest']
# general_measures_df = mean_norm(general_measures_df, features)

In [32]:
# Rescale every feature column onto [0, 1].
general_measures_df = min_max_norm(general_measures_df, features)

In [34]:
# Preview the normalised features.
general_measures_df.head()

Out[34]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.594543 0.414510 0.275433 0.321249 0.162933 0.262021 0.198020 0.090468 0.244690
1 10 Adventure 5 0.944766 0.633743 0.689716 0.671798 0.348697 0.202718 0.188119 0.148627 0.175675
2 102 Crime 3 0.529726 0.241184 0.139809 0.214798 0.058116 0.159062 0.099010 0.029079 0.211461
3 103 Comedy 1 0.635872 0.499929 0.462623 0.632531 0.317045 0.511183 0.475248 0.303716 0.290580
4 104 Adventure 5 0.525530 0.417218 0.107036 0.487364 0.119015 0.399300 0.386139 0.137318 0.246358
In [35]:
# Shape is unchanged by min-max scaling.
general_measures_df.shape

Out[35]:
(728, 12)

Remove outliers (keep only those within -5 and +5 standard deviations)

In [36]:
# Drop rows lying more than 5 standard deviations from a feature's mean;
# the mean/std are recomputed on the progressively filtered frame.
for feature in features:
    col = general_measures_df[feature]
    within = np.abs(col - col.mean()) <= (5 * col.std())
    general_measures_df = general_measures_df[within]

In [37]:
# Rows remaining after outlier removal.
general_measures_df.shape

Out[37]:
(721, 12)
In [38]:
# general_measures_df.to_csv('mg-features.csv', index=False, float_format="%.2f")


Feature selection

• Visualization
In [39]:
# Recall the list of popular genres.
main_genres

Out[39]:
['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']
In [47]:
# Subset of genres to visualise against each other.
genres_list = ['Biography', 'Adventure', 'Horror']

In [48]:
# Restrict the measures table to the genres selected above.
subset_with_genres = general_measures_df.loc[general_measures_df['genres'].isin(genres_list)]

In [49]:
# Columns for the pair plot: the genre label plus the topological features.
features_list = ['genres', 'clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges']

In [50]:
# Data behind the pair plot below.
viz_df = subset_with_genres[features_list]

In [51]:
# Class balance of the selected genres.
viz_df.groupby(['genres']).size()

Out[51]:
genres
Biography    44
Horror       40
dtype: int64
In [52]:
# Pairwise scatter matrix of the features coloured by genre; the upper
# triangle additionally shows a per-genre regression fit.
g = sns.pairplot(viz_df, hue='genres', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

# Alternative KDE-based pair grid, kept from exploration:
# g = sns.PairGrid(viz_df, hue='genres', hue_kws={"cmap": ["Blues", "Reds"]})
# g = g.map_offdiag(sns.kdeplot, lw=3)
# g = g.map_diag(sns.kdeplot, lw=1)

Out[52]:
<seaborn.axisgrid.PairGrid at 0x7fa6f8abb198>

Prepare dataset

In [110]:
# Feature matrix and class labels for the selected genres.
features = features_list[1:]  # drop the 'genres' string column
# FIX: DataFrame.as_matrix() was removed in pandas 1.0; .to_numpy() is the
# documented replacement.
data = subset_with_genres[features].to_numpy()
target = subset_with_genres['target'].to_numpy()
data.shape[0] == target.shape[0]  # sanity check: one label per row

Out[110]:
True
In [111]:
# Number of topological features.
len(features)

Out[111]:
8

### Transformations

• Polynomial features
In [55]:
# Adventure biography
# modularity x deg_std
# modularity x nodes

In [56]:
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=3, interaction_only=True)
# poly_df = pd.DataFrame(poly.fit_transform(general_measures_df[features_list[1:]]), columns=poly.get_feature_names())
# poly_df.shape

In [57]:
# viz_df = poly_df.iloc[:,43:53].join(general_measures_df['genres'])
# viz_df = viz_df.loc[viz_df['genres'].isin(genres_list)]
# viz_df.groupby('genres').size()

In [58]:
# g = sns.pairplot(viz_df, hue='genres')

• Non-linear transformation
In [59]:
# from sklearn import preprocessing

In [60]:
# quantile_transformer = preprocessing.QuantileTransformer(random_state=0, output_distribution='uniform')
# data_trans = pd.DataFrame(quantile_transformer.fit_transform(general_measures_df[features_list[1:]]))
# data_trans = data_trans.join(general_measures_df['genres'])
# data_trans = data_trans.loc[data_trans['genres'].isin(genres_list)]

In [61]:
# sns.pairplot(data_trans, hue='genres')

• Manifold learning
In [62]:
from sklearn import manifold
from matplotlib.ticker import NullFormatter

In [63]:
# Target dimensionality and neighbourhood size for the manifold embeddings.
n_components = 2
n_neighbors = 10

In [64]:
# Embedding dimensions plotted on the x and y axes.
dimX = 0
dimY = 1

In [65]:
# Adapted from: http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html#sphx-glr-auto-examples-manifold-plot-compare-methods-py
fig = plt.figure(figsize=(15, 8))

methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

# LLE-family embeddings (disabled, kept for reference):
# for i, method in enumerate(methods):
#     data_lle_trans = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
#                                         n_components=n_components,
#                                         eigen_solver='auto',
#                                         method=method).fit_transform(data)
#     ax = fig.add_subplot(251 + i)
#     ax.scatter(data_lle_trans[:, dimX], data_lle_trans[:, dimY], c=target)
#     ax.set_title("%s" % labels[i])
#     ax.axis('tight')

# FIX: give each embedding its own subplot -- the original drew all four
# scatter plots onto one implicit axes, each title overwriting the last.
ax = fig.add_subplot(2, 2, 1)
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
data_tsne_trans = tsne.fit_transform(data)
ax.scatter(data_tsne_trans[:, dimX], data_tsne_trans[:, dimY], c=target)
ax.set_title("t-SNE")

ax = fig.add_subplot(2, 2, 2)
mds = manifold.MDS(n_components=n_components, max_iter=100, n_init=1)
data_mds_trans = mds.fit_transform(data)
ax.scatter(data_mds_trans[:, dimX], data_mds_trans[:, dimY], c=target)
ax.set_title("MDS")

ax = fig.add_subplot(2, 2, 3)
se = manifold.SpectralEmbedding(n_components=n_components,
                                n_neighbors=n_neighbors)
data_se_trans = se.fit_transform(data)
ax.scatter(data_se_trans[:, dimX], data_se_trans[:, dimY], c=target)
ax.set_title("Spectral Embedding")

ax = fig.add_subplot(2, 2, 4)
# keyword arguments: positional use of these params is an error in scikit-learn >= 1.2
isomap = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components)
data_isomap_trans = isomap.fit_transform(data)
ax.scatter(data_isomap_trans[:, dimX], data_isomap_trans[:, dimY], c=target)
ax.set_title("Isomap")

Out[65]:
Text(0.5,1,'Isomap')

Add manifold features to the dataset

In [66]:
# Use the 2-D MDS embedding as two extra features.
manifold_features = ['manifoldX', 'manifoldY']
manifold_data = data_mds_trans

In [67]:
# Append the manifold coordinates to the topological feature matrix
# (and extend the feature-name list to match).
data_man = np.concatenate((data, manifold_data), axis=1)
features_man = np.concatenate((features, manifold_features))

• Visualize manifold features + topological features
In [69]:
# DataFrame view of the augmented feature matrix.
manifold_df = pd.DataFrame(data_man, columns=features_man, dtype='float')

In [70]:
# normalize
# Standardise only the manifold columns (the topological ones were already
# min-max scaled earlier).
manifold_df = mean_norm(manifold_df, manifold_features)

In [71]:
# FIX: np.float was removed in NumPy 1.24; the builtin float (interpreted
# as float64 by astype) is the documented replacement.
manifold_df['target'] = target.astype(float)

In [72]:
# Pair plot of topological + manifold features, coloured by class label,
# with regression fits in the upper triangle.
g = sns.pairplot(manifold_df, hue='target', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

Out[72]:
<seaborn.axisgrid.PairGrid at 0x7fa6e607df60>