Graph measures

Compute graph measures for the character social networks extracted from the movies

In [1]:
import networkx as nx
from matplotlib import pyplot as plt
import numpy.linalg
import pymongo as pm
import pandas as pd
from community import community_louvain as community
import numpy as np
import seaborn as sns
In [2]:
plt.style.use(['dark_background', 'seaborn-talk'])
In [3]:
PATH = './gexf/'

Read ID-TITLE list

In [4]:
fname = "movies-unique.txt"

with open(fname) as f:
    content = f.readlines()
content = [x.strip() for x in content]

import re
movies = {}
for movie in content:
    movie_id = re.search(r'^\d+', movie)  # leading numeric ID
    title = re.search(r'-.*', movie)      # everything after the first dash
    movies[int(movie_id.group(0))] = title.group(0)[1:].replace('-', ' ')

# fix some non-ASCII characters, numbers, and alternative movie titles
movies[38] = 'Alien 3'
movies[873] = 'WALL E'
movies[100] = 'Bad Lieutenant: Port of Call New Orleans'
movies[3] = '12 and Holding'
movies[453] = 'Jennifer 8'
movies[575] = 'Mrs Brown'
movies[647] = 'Postino'
movies[735] = 'Shivers'
del movies[885] # the movie "White Jazz" has never been filmed
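
A quick illustrative check of the parsing above on a hypothetical line (real entries are assumed to follow the same ID-title-with-dashes format):

sample = '101-some-movie-title'  # hypothetical example line
sample_id = int(re.search(r'^\d+', sample).group(0))
sample_title = re.search(r'-.*', sample).group(0)[1:].replace('-', ' ')
print(sample_id, sample_title)  # prints: 101 some movie title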

MongoDB connector

In [5]:
client = pm.MongoClient()
db = client.moviegalaxies
movies_collection = db.movies
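
To see the shape of a stored document, fetch a single record; at minimum the fields used below, 'mg_id' and 'Genre', are expected to be present (no other fields are relied on here):

doc = movies_collection.find_one()
print(doc['mg_id'], doc['Genre'])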

Retrieve genres for each movie

In [6]:
# genres of the movies MG_ID-GENRE
genres = {}
no_genre = []
for movie in movies_collection.find():
    if movie['Genre'] != 'N/A':
        genres[movie['mg_id']] = movie['Genre'].split(', ')
    else:
        no_genre.append(movie['mg_id'])
print(no_genre, "do(es) not have a genre")
[906] do(es) not have a genre

Assign a numeric label to each genre; keep only the popular genres

In [7]:
genres_df = pd.DataFrame(columns=['mg_id', 'genres', 'target'])
In [8]:
main_genres = ['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']
In [9]:
main_genres_dict = {genre: i for i, genre in enumerate(main_genres)}
In [10]:
main_genres_dict
Out[10]:
{'Action': 0,
 'Adventure': 5,
 'Biography': 4,
 'Comedy': 1,
 'Crime': 3,
 'Drama': 2,
 'Horror': 6}
In [11]:
class_genre_dict = {label: genre for genre, label in main_genres_dict.items()}
In [12]:
row = 0
for movie_id, g in genres.items():
    first_genre = g[0]
    if first_genre in main_genres:
        genres_df.loc[row] = [movie_id, first_genre, main_genres_dict[first_genre]]
        row += 1
In [13]:
genres_df[:10]
Out[13]:
mg_id genres target
0 101 Comedy 1
1 10 Adventure 5
2 102 Crime 3
3 103 Comedy 1
4 104 Adventure 5
5 106 Action 0
6 107 Crime 3
7 108 Biography 4
8 109 Action 0
9 110 Action 0
In [14]:
genres_df.shape
Out[14]:
(731, 3)
In [15]:
genres_df.groupby('genres').count()
Out[15]:
mg_id target
genres
Action 200 200
Adventure 41 41
Biography 45 45
Comedy 171 171
Crime 82 82
Drama 151 151
Horror 41 41
In [16]:
full_genres_df = pd.DataFrame(columns=['mg_id', 'genres_list', 'target'])
In [17]:
row = 0
for movie_id, g in genres.items():
    first_genre = g[0]
    if first_genre in main_genres:
        full_genres_df.loc[row] = [movie_id, g, main_genres_dict[first_genre]]
        row += 1
In [18]:
full_genres_df.head()
Out[18]:
mg_id genres_list target
0 101 [Comedy, Crime, Drama] 1
1 10 [Adventure, Sci-Fi] 5
2 102 [Crime, Drama] 3
3 103 [Comedy, Drama, Music] 1
4 104 [Adventure, Drama, History] 5

Compute network measures

In [19]:
def compute_eigenvalues(g):
    """
    Compute the eigenvalues of the normalized Laplacian of graph g.
    Return: array of eigenvalues.
    """
    g = g.to_undirected()
    L = nx.normalized_laplacian_matrix(g)
    e = numpy.linalg.eigvals(L.A)  # .A converts the sparse matrix to a dense array
    return e
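
As a side note on this spectrum: the eigenvalues of the normalized Laplacian always lie in [0, 2], and the second-smallest one is positive exactly when the graph is connected. A minimal sketch, assuming 10.gexf exists under PATH (movie 10 appears in the tables below):

ev_example = np.sort(np.real(compute_eigenvalues(nx.read_gexf(PATH + '10.gexf'))))
print(ev_example.min(), ev_example.max())  # bounded by 0 and 2
print(ev_example[1])                       # > 0 for a connected graph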
In [20]:
general_measures_df = pd.DataFrame(columns=[
    'mg_id', 
    'clustering', 
    'assortativity', 
    'transitivity', 
    'modularity', 
    'deg_mean', 
    'deg_std',
    'nodes',
    'edges',
    'shortest'])
In [21]:
loc = 0
for movie_id, title in movies.items():
#     print(movie_id)
    if not movie_id == 2: # graph '2' is incomplete
        g = nx.read_gexf(PATH + str(movie_id) + '.gexf')
        # compute some measures and store them as a pandas dataframe
#         radius = nx.radius(g)
        clustering = nx.average_clustering(g)
        # Assortativity: the tendency of vertices to connect to other vertices that are like (or unlike) them in some way.
        # https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.assortativity.html
        # https://arxiv.org/pdf/cond-mat/0209450.pdf
        # nx.degree_assortativity_coefficient(g)
        assortativity = nx.degree_pearson_correlation_coefficient(g)
        transitivity = nx.transitivity(g)
#         estrada = nx.estrada_index(g)
        ev = compute_eigenvalues(g)  # computed for inspection only; not stored in the dataframe below
#         print(ev)
        degrees = [d for n, d in g.degree()]
        part = community.best_partition(g)
        modularity = community.modularity(part, g)
        shortest_path = nx.average_shortest_path_length(g)
        general_measures_df.loc[loc] = [
            movie_id, 
            clustering, 
            assortativity, 
            transitivity, 
            modularity, 
            np.mean(degrees), 
            np.std(degrees), 
            len(g.nodes), 
            len(g.edges),
            shortest_path
        ]
        loc += 1
#         if loc == 10:
#             break
/home/volodymyrmiz/miniconda3/lib/python3.6/site-packages/scipy/stats/stats.py:3003: RuntimeWarning: invalid value encountered in double_scalars
  r = r_num / r_den

The warning comes from graphs whose vertices all have the same degree: the degree correlation is then 0/0 and the assortativity evaluates to NaN. These rows are removed by the dropna() applied after the merge below.
In [22]:
general_measures_df.head()
Out[22]:
mg_id clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 100.0 0.805678 -0.342569 0.280090 0.446111 4.307692 5.598323 39.0 84.0 1.929825
1 101.0 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
2 10.0 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
3 102.0 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
4 103.0 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
In [23]:
general_measures_df.shape
Out[23]:
(772, 10)

Add the 'genres' and 'target' columns; only the popular genres are kept. Rows with NaN measures (see the warning above) are dropped

In [24]:
general_measures_df = pd.merge(genres_df, general_measures_df, on='mg_id').dropna()
In [25]:
general_measures_df.shape
Out[25]:
(728, 12)
In [26]:
general_measures_df.head()
Out[26]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.677572 -0.385180 0.326568 0.222181 4.714286 4.604789 28.0 66.0 1.976190
1 10 Adventure 5 0.879083 -0.196563 0.645309 0.462104 7.555556 3.899984 27.0 102.0 1.794872
2 102 Crime 3 0.640278 -0.534301 0.222222 0.149324 3.111111 3.381139 18.0 28.0 1.888889
3 103 Comedy 1 0.701352 -0.311690 0.470588 0.435228 7.071429 7.566036 56.0 198.0 2.096753
4 104 Adventure 5 0.637863 -0.382850 0.197007 0.335873 4.042553 6.236330 47.0 95.0 1.980574
In [27]:
# general_measures_df.hist()
# plt.show()

Normalize

In [28]:
# mean normalization
def mean_norm(df, columns):
    result = df.copy()
    for feature_name in columns:
        mean_value = df[feature_name].mean()
        std_value = df[feature_name].std()
        result[feature_name] = (df[feature_name] - mean_value) / std_value
    return result
In [29]:
# min-max normalization
def min_max_norm(df, columns):
    result = df.copy()
    for feature_name in columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result
In [30]:
# z-score normalization
from scipy import stats
def zscore_norm(df, columns):
    result = df.dropna().copy()
    for feature_name in columns:
        result[feature_name] = stats.zscore(result[feature_name])  # use the NaN-free copy so lengths match
    return result
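
The three normalizers differ only in the statistics they rescale by. A toy comparison (note that stats.zscore divides by the population standard deviation, ddof=0, while pandas .std() defaults to the sample one, ddof=1):

toy = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})
print(mean_norm(toy, ['x'])['x'].values)     # zero mean, unit sample std
print(min_max_norm(toy, ['x'])['x'].values)  # rescaled to [0, 1]
print(zscore_norm(toy, ['x'])['x'].values)   # zero mean, unit population std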
In [31]:
# normalize
features = ['clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges', 'shortest']
# general_measures_df = mean_norm(general_measures_df, features)
In [32]:
general_measures_df = min_max_norm(general_measures_df, features)
In [34]:
general_measures_df.head()
Out[34]:
mg_id genres target clustering assortativity transitivity modularity deg_mean deg_std nodes edges shortest
0 101 Comedy 1 0.594543 0.414510 0.275433 0.321249 0.162933 0.262021 0.198020 0.090468 0.244690
1 10 Adventure 5 0.944766 0.633743 0.689716 0.671798 0.348697 0.202718 0.188119 0.148627 0.175675
2 102 Crime 3 0.529726 0.241184 0.139809 0.214798 0.058116 0.159062 0.099010 0.029079 0.211461
3 103 Comedy 1 0.635872 0.499929 0.462623 0.632531 0.317045 0.511183 0.475248 0.303716 0.290580
4 104 Adventure 5 0.525530 0.417218 0.107036 0.487364 0.119015 0.399300 0.386139 0.137318 0.246358
In [35]:
general_measures_df.shape
Out[35]:
(728, 12)

Remove outliers (keep only rows within ±5 standard deviations of the feature mean)

In [36]:
for feature in features:
    deviation = np.abs(general_measures_df[feature] - general_measures_df[feature].mean())
    general_measures_df = general_measures_df[deviation <= 5 * general_measures_df[feature].std()]
In [37]:
general_measures_df.shape
Out[37]:
(721, 12)
In [38]:
# general_measures_df.to_csv('mg-features.csv', index=False, float_format="%.2f")

Feature selection

  • Visualization
In [39]:
main_genres
Out[39]:
['Action', 'Comedy', 'Drama', 'Crime', 'Biography', 'Adventure', 'Horror']
In [47]:
genres_list = ['Biography', 'Adventure', 'Horror']
In [48]:
subset_with_genres = general_measures_df.loc[general_measures_df['genres'].isin(genres_list)]
In [49]:
features_list = ['genres', 'clustering', 'assortativity', 'transitivity', 'modularity', 'deg_mean', 'deg_std', 'nodes', 'edges']
In [50]:
viz_df = subset_with_genres[features_list]
In [51]:
viz_df.groupby(['genres']).size()
Out[51]:
genres
Adventure    41
Biography    44
Horror       40
dtype: int64
In [52]:
g = sns.pairplot(viz_df, hue='genres', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)

# g = sns.PairGrid(viz_df, hue='genres', hue_kws={"cmap": ["Blues", "Reds"]})
# g = g.map_offdiag(sns.kdeplot, lw=3)
# g = g.map_diag(sns.kdeplot, lw=1)
Out[52]:
<seaborn.axisgrid.PairGrid at 0x7fa6f8abb198>

Prepare dataset

In [110]:
features = features_list[1:]
data = subset_with_genres[features].values
target = subset_with_genres['target'].values
data.shape[0] == target.shape[0]
Out[110]:
True
In [111]:
len(features)
Out[111]:
8

Transformations

  • Polynomial features
In [55]:
# Adventure biography
# modularity x deg_std
# modularity x nodes
In [56]:
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=3, interaction_only=True)
# poly_df = pd.DataFrame(poly.fit_transform(general_measures_df[features_list[1:]]), columns=poly.get_feature_names())
# poly_df.shape
In [57]:
# viz_df = poly_df.iloc[:,43:53].join(general_measures_df['genres'])
# viz_df = viz_df.loc[viz_df['genres'].isin(genres_list)]
# viz_df.groupby('genres').size()
In [58]:
# g = sns.pairplot(viz_df, hue='genres')
  • Non-linear transformation
In [59]:
# from sklearn import preprocessing
In [60]:
# quantile_transformer = preprocessing.QuantileTransformer(random_state=0, output_distribution='uniform')
# data_trans = pd.DataFrame(quantile_transformer.fit_transform(general_measures_df[features_list[1:]]))
# data_trans = data_trans.join(general_measures_df['genres'])
# data_trans = data_trans.loc[data_trans['genres'].isin(genres_list)]
In [61]:
# sns.pairplot(data_trans, hue='genres')
  • Manifold learning
In [62]:
from sklearn import manifold
from matplotlib.ticker import NullFormatter
In [63]:
n_components = 2
n_neighbors = 10
In [64]:
dimX = 0
dimY = 1
In [65]:
# adapted from: http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html#sphx-glr-auto-examples-manifold-plot-compare-methods-py
fig = plt.figure(figsize=(15, 8))

methods = ['standard', 'ltsa', 'hessian', 'modified']
labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE']

# for i, method in enumerate(methods):
#     data_lle_trans = manifold.LocallyLinearEmbedding(n_neighbors, n_components,
#                                         eigen_solver='auto',
#                                         method=method).fit_transform(data)
#     ax = fig.add_subplot(251 + i)
#     plt.scatter(data_lle_trans[:, dimX], data_lle_trans[:, dimY], c=target)
#     plt.title("%s" % labels[i])
# #     ax.xaxis.set_major_formatter(NullFormatter())
# #     ax.yaxis.set_major_formatter(NullFormatter())
#     plt.axis('tight')

ax = fig.add_subplot(256)  # position 6 on a 2x5 grid; the first row is reserved for the commented-out LLE variants
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
data_tsne_trans = tsne.fit_transform(data)
plt.scatter(data_tsne_trans[:, dimX], data_tsne_trans[:, dimY], c=target)
plt.title("t-SNE")

ax = fig.add_subplot(257)
mds = manifold.MDS(n_components, max_iter=100, n_init=1)
data_mds_trans = mds.fit_transform(data)
plt.scatter(data_mds_trans[:, dimX], data_mds_trans[:, dimY], c=target)
plt.title("MDS")

ax = fig.add_subplot(258)
se = manifold.SpectralEmbedding(n_components=n_components,
                                n_neighbors=n_neighbors)
data_se_trans = se.fit_transform(data)
plt.scatter(data_se_trans[:, dimX], data_se_trans[:, dimY], c=target)
plt.title("Spectral Embedding")

ax = fig.add_subplot(259)
isomap = manifold.Isomap(n_neighbors, n_components)
data_isomap_trans = isomap.fit_transform(data)
plt.scatter(data_isomap_trans[:, dimX], data_isomap_trans[:, dimY], c=target)
plt.title("Isomap")
Out[65]:
Text(0.5,1,'Isomap')
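
One rough way to gauge how faithful the 2-D MDS embedding is before reusing it below (sklearn sets stress_ after fitting; lower is better):

print(mds.stress_)  # final value of the MDS objective (sum of squared distance residuals)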

Add manifold features to the dataset

In [66]:
manifold_features = ['manifoldX', 'manifoldY']
manifold_data = data_mds_trans  # use the 2-D MDS embedding as the extra features
In [67]:
data_man = np.concatenate((data, manifold_data), axis=1)
features_man = np.concatenate((features, manifold_features))
  • Visualize manifold features + topological features
In [69]:
manifold_df = pd.DataFrame(data_man, columns=features_man, dtype='float')
In [70]:
# normalize
manifold_df = mean_norm(manifold_df, manifold_features)
In [71]:
manifold_df['target'] = target.astype(float)
In [72]:
g = sns.pairplot(manifold_df, hue='target', plot_kws={"s":20}, palette="hls")
g.map_upper(sns.regplot)
Out[72]:
<seaborn.axisgrid.PairGrid at 0x7fa6e607df60>