Süha Kagan Köse - Tao Sun - Xiangzhe Meng - Xingce Bao
In this notebook, we did the data preparation work for generating the two network graphs:
Random network
Circle network
# Standard data-science stack for loading the pickled Amazon product data
# and preparing the co-purchase network tables.
import pandas as pd
import gzip
import json
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Silence SettingWithCopyWarning: several cells below assign columns on
# DataFrame slices, which would otherwise warn on every assignment.
pd.options.mode.chained_assignment = None
# Load the pre-processed Electronics tables (built in an earlier step).
# Each row of df_product carries an 'also_bought' list of related ASINs,
# indexed by the product's own ASIN.
df_product = pd.read_pickle('data/product_also_bought.pkl')
df_review = pd.read_pickle('data/review_also_bought.pkl')
print("Ultimately, we decide to keep {} electronic products to conduct our project".format(df_product.shape[0]))
Ultimately, we decide to keep 43276 electronic products to conduct our project
def get_all_related_asin_from_list_asin(list_asin):
    """Return the deduplicated union of 'also_bought' ASINs for *list_asin*.

    Each ASIN is looked up in the module-level ``df_product`` table and all
    related-product lists are merged into one list without duplicates.
    Result order is unspecified (a set is used for deduplication), matching
    the notebook's usage where only membership matters.
    """
    related = set()
    for asin in list_asin:
        related.update(df_product.loc[asin]['also_bought'])
    return list(related)
nb_layer
denotes the number of layers that we traverse in the dataframe to collect all the products related to our chosen product (Macbook or Surface).
After trying several values for nb_layer, we decided to use a 2-layer search, which gives us the most intuitive and impressive network graph for presenting the two clusters.
# Macbook: expand nb_layer hops outward from the Macbook seed ASIN,
# accumulating every related product encountered on the way.
nb_layer = 2
seed_asin = 'B005CWJB5G'
df_mac = df_product[df_product.index == seed_asin]
frontier = [seed_asin]
for _ in range(nb_layer):
    frontier = get_all_related_asin_from_list_asin(frontier)
    df_mac = pd.concat([df_mac, df_product.loc[frontier]])
# Collapse duplicates picked up across layers; 'asin' becomes a column.
df_mac = df_mac.reset_index().drop_duplicates(subset='asin')
df_mac.shape
(743, 2)
# Surface: the same nb_layer-hop expansion, seeded from the Surface ASIN.
nb_layer = 2
seed_asin = 'B00FG7MVJU'
df_surface = df_product[df_product.index == seed_asin]
frontier = [seed_asin]
for _ in range(nb_layer):
    frontier = get_all_related_asin_from_list_asin(frontier)
    df_surface = pd.concat([df_surface, df_product.loc[frontier]])
# Collapse duplicates picked up across layers; 'asin' becomes a column.
df_surface = df_surface.reset_index().drop_duplicates(subset='asin')
df_surface.shape
(499, 2)
# Number of products reachable from both seeds (overlap of the two clusters).
len(set(df_mac.asin.values) & set(df_surface.asin.values))
134
# ASINs that appear in both clusters; these shared products are excluded
# from the per-cluster frames further down.
intersection_index = list(set(df_mac.asin.values).intersection(df_surface.asin.values))
# Merged view of both clusters with one row per distinct product.
df_all = pd.concat([df_mac, df_surface]).drop_duplicates(subset='asin')
df_all.head()
asin | also_bought | |
---|---|---|
0 | B005CWJB5G | [B004R6J2V6, B007FL7GGS, B0077EVNY4, B008AL9VX... |
1 | B004I4BS1E | [B008AKNIQ0, B004J0DY1O, B007FL7GGS, B004J0IMY... |
2 | B0047DVRQW | [B004WI867K, B005CWIN1E, B005CWJB5G, B005CWJ8Y... |
3 | B004WI8YRC | [B007FL7GGS, B004WI867K, B00C62CHT2, B004I4BS1... |
4 | B004YLCHLQ | [B004R6J2KW, B008AL9VXI, B003XIJ3MW, B004YLCH0... |
# Restrict to products that belong only to the Macbook cluster.
df_mac.set_index('asin',inplace=True)
only_mac = set(df_mac.index.values) - set(intersection_index)
# NOTE(review): indexing .loc with a set is deprecated in newer pandas —
# wrap in list(...) if upgrading.
df_mac_only = df_mac.loc[only_mac]
# compute the number of also_bought products for each product
df_mac_only['# also_bought'] = df_mac_only.also_bought.apply(len)
# sort by # also_bought (descending) and keep the 50 products ranked
# 150-199 — the most-connected nodes are skipped, presumably to keep the
# drawn network readable
df_mac_new = df_mac_only.sort_values('# also_bought',ascending=False).iloc[150:200]
# drop the helper count column
df_mac_new.drop(['# also_bought'],axis=1,inplace=True)
df_mac_new.reset_index(inplace=True)
df_mac_new.shape
(50, 2)
# Restrict to products that belong only to the Surface cluster.
df_surface.set_index('asin',inplace=True)
only_surface = set(df_surface.index.values) - set(intersection_index)
# NOTE(review): indexing .loc with a set is deprecated in newer pandas —
# wrap in list(...) if upgrading.
df_surface_only = df_surface.loc[only_surface]
# compute the number of also_bought products for each product
df_surface_only['# also_bought'] = df_surface_only.also_bought.apply(len)
# sort by # also_bought (descending) and keep the 50 products ranked
# 150-199 — the most-connected nodes are skipped, presumably to keep the
# drawn network readable
df_surface_new = df_surface_only.sort_values('# also_bought',ascending=False).iloc[150:200]
# drop the helper count column
df_surface_new.drop(['# also_bought'],axis=1,inplace=True)
df_surface_new.reset_index(inplace=True)
df_surface_new.shape
(50, 2)
# Arrays of cluster-exclusive ASINs; transform() uses mac_asin to decide
# which cluster label each related product gets.
mac_asin = df_mac_only.index.values
surface_asin = df_surface_only.index.values
def transform(x):
    """Prefix every ASIN in *x* with its cluster label.

    ASINs present in the module-level ``mac_asin`` array become
    'Macbook: <asin>'; all other ASINs become 'Surface: <asin>'.
    Returns a new list; the input is not modified.
    """
    return ['Macbook: ' + asin if asin in mac_asin else 'Surface: ' + asin
            for asin in x]
# Add a cluster label prefix to every asin: 'Macbook: ...' / 'Surface: ...'
df_mac_new.asin = df_mac_new.asin.apply(lambda x: 'Macbook: ' + x)
df_surface_new.asin = df_surface_new.asin.apply(lambda x: 'Surface: ' + x)
df_mac_new.also_bought = df_mac_new.also_bought.apply(transform)
df_surface_new.also_bought = df_surface_new.also_bought.apply(transform)
# Concatenate the two sub dataframes
df_all = pd.concat([df_mac_new, df_surface_new])
df_all.drop_duplicates(subset='asin', inplace=True)
# Keep only also_bought links whose target is itself one of our kept nodes.
df_all.set_index('asin', inplace=True)
# Set gives O(1) membership tests instead of scanning the index array per link.
kept_asins = set(df_all.index.values)
for asin in df_all.index.values:
    # BUGFIX: the original wrote `df_all.loc[asin].also_bought = ...`, which
    # assigns to a temporary row copy and (on modern pandas) silently leaves
    # df_all unchanged; .at writes back into the frame directly.
    df_all.at[asin, 'also_bought'] = [
        related for related in df_all.at[asin, 'also_bought']
        if related in kept_asins
    ]
df_all.reset_index(inplace=True)
df_all.shape
df_all.shape
(100, 2)
import json
# Serialize the labeled nodes (one record per product, with its filtered
# also_bought list) for the D3.js visualisation.
df_all.to_json('amazon.json',orient='records')
df_all.head()
also_bought | asin | |
---|---|---|
0 | [] | Macbook: B00CO8TBNS |
1 | [Macbook: B003OC6LWM, Macbook: B0081XI2Y4] | Macbook: B001W0JOUO |
2 | [Macbook: B003L4P872, Macbook: B00HHAJRU0] | Macbook: B00009EFR2 |
3 | [Macbook: B00CO8TBOW, Macbook: B00CO8TBNS, Mac... | Macbook: B007GFX0PY |
4 | [Macbook: B003XIJ566, Macbook: B004CB56XE, Mac... | Macbook: B004L9M0AO |
# Random-network round: rank the FULL Macbook cluster (shared products
# included) by number of also_bought links.
df_mac['# also_bought'] = df_mac.also_bought.apply(len)
# sort by # also_bought (descending) and keep the 100 products ranked
# 100-199 — presumably trading connectivity for a readable drawn network
df_mac_new = df_mac.sort_values('# also_bought',ascending=False).iloc[100:200]
# drop the helper count column
df_mac_new.drop(['# also_bought'],axis=1,inplace=True)
df_mac_new.reset_index(inplace=True)
# Same ranking for the FULL Surface cluster (shared products included).
df_surface['# also_bought'] = df_surface.also_bought.apply(len)
# sort by # also_bought (descending) and keep the 100 products ranked
# 100-199 — presumably trading connectivity for a readable drawn network
df_surface_new = df_surface.sort_values('# also_bought',ascending=False).iloc[100:200]
# drop the helper count column
df_surface_new.drop(['# also_bought'],axis=1,inplace=True)
df_surface_new.reset_index(inplace=True)
# Refresh the lookup arrays that transform() consults for cluster labelling.
mac_asin = df_mac_only.index.values
surface_asin = df_surface_only.index.values
# Prefix each asin with its cluster name and relabel the also_bought lists.
df_mac_new.asin = 'Macbook: ' + df_mac_new.asin
df_surface_new.asin = 'Surface: ' + df_surface_new.asin
df_mac_new.also_bought = df_mac_new.also_bought.apply(transform)
df_surface_new.also_bought = df_surface_new.also_bought.apply(transform)
# Source table for the edge list: the two labeled clusters merged,
# duplicate products removed.
df_all_net = pd.concat([df_mac_new,df_surface_new])
df_all_net.drop_duplicates(subset='asin', inplace=True)
# Node table for the Macbook cluster. FIX: take an explicit .copy() so the
# 'group' column is added to an independent frame rather than a slice of
# df_mac_new (the original relied on silenced SettingWithCopy behavior).
df_mac_net = df_mac_new[['asin']].copy()
# 'group' encodes the product node color in the D3 rendering (0 = Macbook)
df_mac_net['group'] = 0
df_mac_net.columns = ['id','group']
df_mac_net.shape
(100, 2)
# Node table for the Surface cluster. FIX: explicit .copy() as above, so the
# 'group' column is added to an independent frame, not a slice.
df_surface_net = df_surface_new[['asin']].copy()
# 'group' encodes the product node color in the D3 rendering (2 = Surface)
df_surface_net['group'] = 2
df_surface_net.columns = ['id','group']
df_surface_net.shape
(100, 2)
# Export the combined node table for D3.
df_all_nodes = pd.concat([df_mac_net,df_surface_net])
df_all_nodes.to_json('node.json',orient='records')
# Build the link (edge) table: one row per also_bought relation whose two
# endpoints are both kept nodes.
df_all_net.set_index('asin',inplace=True)
# Set gives O(1) endpoint membership checks; the original scanned the id
# column array twice per candidate edge.
node_ids = set(df_all_nodes.id.values)
# FIX: collect rows in a list and build the DataFrame once. DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and appending row-by-row
# in a loop is quadratic.
rows = []
for asin in df_all_net.index.values:
    if asin not in node_ids:
        continue
    for item in df_all_net.loc[asin].also_bought:
        if item in node_ids:
            rows.append([asin, item, 1])
df_link = pd.DataFrame(rows, columns=('source','target','value'))
df_link.to_json('link.json',orient='records')
df_link.shape
(1830, 3)
Therefore, we use 200 product nodes with 1830 links for the random network.
df_all_nodes.head()
group | id | |
---|---|---|
0 | 0 | Macbook: B004QWU1H0 |
1 | 0 | Macbook: B00BGGDVOO |
2 | 0 | Macbook: B00030AXNQ |
3 | 0 | Macbook: B0006LSISO |
4 | 0 | Macbook: B00212NO6W |
df_link.head()
source | target | value | |
---|---|---|---|
0 | Macbook: B004QWU1H0 | Macbook: B007477COO | 1 |
1 | Macbook: B004QWU1H0 | Macbook: B007476KRY | 1 |
2 | Macbook: B004QWU1H0 | Macbook: B0076W9Q5A | 1 |
3 | Macbook: B004QWU1H0 | Macbook: B00AN5PUSE | 1 |
4 | Macbook: B004QWU1H0 | Macbook: B002TLTH7K | 1 |
As the volume of the dataset is very large, we cannot generate the network with the usual Python packages, for instance, NetworkX. Therefore, we decided to use D3.js
to generate animated and dynamic network graphs.
D3.js
is a JavaScript library for producing dynamic, interactive data visualizations in web browsers. It makes use of the widely implemented SVG, HTML5, and CSS standards.
Our network graphs are shown in the following link: https://xiangzhemeng.github.io/ntds/index.html