#!/usr/bin/env python
# coding: utf-8

Social network Graph Link Prediction - Facebook Challenge

# ### Problem statement:
# Given a directed social graph, we have to predict missing links in order to recommend users (link prediction in a graph).
# ### Data Overview
# Data is taken from Facebook's recruiting challenge on Kaggle: https://www.kaggle.com/c/FacebookRecruiting
# The data contains two columns, source and destination; each row is one edge in the graph.
# - Data columns (total 2 columns):
# - source_node int64
# - destination_node int64
# ### Mapping the problem into a supervised learning problem:
# - Generated training samples of good and bad links from the given directed graph, and for each link computed features such as number of followers, whether the user is followed back, page rank, katz score, adar index, some SVD features of the adjacency matrix, some weight features etc., and trained an ML model on these features to predict the link.
# - Some reference papers and videos:
# - https://www.cs.cornell.edu/home/kleinber/link-pred.pdf
# - https://www3.nd.edu/~dial/publications/lichtenwalter2010new.pdf
# - https://kaggle2.blob.core.windows.net/forum-message-attachments/2594/supervised_link_prediction.pdf
# - https://www.youtube.com/watch?v=2M77Hgy17cg
# ### Business objectives and constraints:
# - No low-latency requirement.
# - The probability of prediction is useful to recommend the highest-probability links.
# ### Performance metric for supervised learning:
# - Both precision and recall are important, so the F1 score is a good choice.
# - Confusion matrix
# In[ ]:
# Importing libraries
# please do go through this python notebook:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd  # pandas to create small dataframes
import datetime  # convert to unix time
import time  # convert to unix time
# if numpy is not installed already: pip3 install numpy
import numpy as np  # arithmetic operations on arrays
# matplotlib: used to plot graphs
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns  # plots
from matplotlib import rcParams  # size of plots
from sklearn.cluster import MiniBatchKMeans, KMeans  # clustering
import math
import pickle
import os
# to install xgboost: pip3 install xgboost
import xgboost as xgb
import networkx as nx
import pdb
# In[ ]:
from google.colab import drive
drive.mount('/content/drive')
# In[ ]:
# reading the graph
if not os.path.isfile('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_woheader.csv'):
    print("true")
    traincsv = pd.read_csv('drive/My Drive/FacebookGraphRecomm/data/data/train.csv')
    print(traincsv[traincsv.isna().any(axis=1)])
    print(traincsv.info())
    print("Number of duplicate entries: ", sum(traincsv.duplicated()))
    traincsv.to_csv('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_woheader.csv', header=False, index=False)
    print("saved the graph into file")
else:
    g = nx.read_edgelist('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_woheader.csv', delimiter=',', create_using=nx.DiGraph(), nodetype=int)
    print(nx.info(g))
# > Displaying a sub graph
# In[ ]:
if not os.path.isfile('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_woheader_sample.csv'):
    pd.read_csv('drive/My Drive/FacebookGraphRecomm/data/data/train.csv', nrows=50).to_csv('train_woheader_sample.csv', header=False, index=False)
subgraph = nx.read_edgelist('train_woheader_sample.csv', delimiter=',', create_using=nx.DiGraph(), nodetype=int)
# https://stackoverflow.com/questions/9402255/drawing-a-huge-graph-with-networkx-and-matplotlib
pos = nx.spring_layout(subgraph)
nx.draw(subgraph, pos, node_color='#A0CBE2', edge_color='#00bb5e', width=1, edge_cmap=plt.cm.Blues, with_labels=True)
plt.savefig("graph_sample.pdf") print(nx.info(subgraph)) # # 1. Exploratory Data Analysis # In[ ]: # No of Unique persons print("The number of unique persons",len(g.nodes())) # ## 1.1 No of followers for each person # In[ ]: indegree_dist = list(dict(g.in_degree()).values()) indegree_dist.sort() plt.figure(figsize=(10,6)) plt.plot(indegree_dist) plt.xlabel('Index No') plt.ylabel('No Of Followers') plt.show() # In[ ]: list(g.in_degree())[:5] # In[ ]: indegree_dist = list(dict(g.in_degree()).values()) indegree_dist.sort() plt.figure(figsize=(10,6)) plt.plot(indegree_dist[0:1500000]) plt.xlabel('Index No') plt.ylabel('No Of Followers') plt.show() # In[ ]: plt.boxplot(indegree_dist) plt.ylabel('No Of Followers') plt.show() # In[ ]: ### 90-100 percentile for i in range(0,11): print(90+i,'percentile value is',np.percentile(indegree_dist,90+i)) # 99% of data having followers of 40 only. # In[ ]: ### 99-100 percentile for i in range(10,110,10): print(99+(i/100),'percentile value is',np.percentile(indegree_dist,99+(i/100))) # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(11.7, 8.27) sns.distplot(indegree_dist, color='#16A085') plt.xlabel('PDF of Indegree') sns.despine() #plt.show() # ## 1.2 No of people each person is following # In[ ]: outdegree_dist = list(dict(g.out_degree()).values()) outdegree_dist.sort() plt.figure(figsize=(10,6)) plt.plot(outdegree_dist) plt.xlabel('Index No') plt.ylabel('No Of people each person is following') plt.show() # In[ ]: # indegree_dist = list(dict(g.in_degree()).values()) # indegree_dist.sort() plt.figure(figsize=(10,6)) plt.plot(outdegree_dist[0:1500000]) plt.xlabel('Index No') plt.ylabel('No Of people each person is following') plt.show() # In[ ]: plt.boxplot(outdegree_dist) plt.ylabel('No Of people each person is following') plt.show() # In[ ]: ### 90-100 percentile for i in range(0,11): print(90+i,'percentile value is',np.percentile(outdegree_dist,90+i)) # In[ ]: ### 99-100 percentile for i in range(10,110,10): print(99+(i/100),'percentile value is',np.percentile(outdegree_dist,99+(i/100))) # In[ ]: sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(11.7, 8.27) sns.distplot(outdegree_dist, color='#16A085') plt.xlabel('PDF of Outdegree') sns.despine() # In[ ]: print('No of persons those are not following anyone are' ,sum(np.array(outdegree_dist)==0),'and % is', sum(np.array(outdegree_dist)==0)*100/len(outdegree_dist) ) # In[ ]: print('No of persons having zero followers are' ,sum(np.array(indegree_dist)==0),'and % is', sum(np.array(indegree_dist)==0)*100/len(indegree_dist) ) # In[ ]: count=0 for i in g.nodes(): if len(list(g.predecessors(i)))==0 : if len(list(g.successors(i)))==0: count+=1 print('No of persons those are not not following anyone and also not having any followers are',count) # ## 1.3 both followers + following # In[ ]: from collections import Counter dict_in = dict(g.in_degree()) dict_out = dict(g.out_degree()) d = Counter(dict_in) + Counter(dict_out) in_out_degree = np.array(list(d.values())) # In[ ]: in_out_degree_sort = sorted(in_out_degree) plt.figure(figsize=(10,6)) plt.plot(in_out_degree_sort) plt.xlabel('Index No') plt.ylabel('No Of people each person is following + followers') plt.show() # In[ ]: in_out_degree_sort = sorted(in_out_degree) plt.figure(figsize=(10,6)) plt.plot(in_out_degree_sort[0:1500000]) plt.xlabel('Index No') plt.ylabel('No Of people each person is following + followers') plt.show() # In[ ]: ### 90-100 percentile for i in 
range(0,11): print(90+i,'percentile value is',np.percentile(in_out_degree_sort,90+i)) # In[ ]: ### 99-100 percentile for i in range(10,110,10): print(99+(i/100),'percentile value is',np.percentile(in_out_degree_sort,99+(i/100))) # In[ ]: print('Min of no of followers + following is',in_out_degree.min()) print(np.sum(in_out_degree==in_out_degree.min()),' persons having minimum no of followers + following') # In[ ]: print('Max of no of followers + following is',in_out_degree.max()) print(np.sum(in_out_degree==in_out_degree.max()),' persons having maximum no of followers + following') # In[ ]: print('No of persons having followers + following less than 10 are',np.sum(in_out_degree<10)) # In[ ]: print('No of weakly connected components',len(list(nx.weakly_connected_components(g)))) count=0 for i in list(nx.weakly_connected_components(g)): if len(i)==2: count+=1 print('weakly connected components wit 2 nodes',count) # # 2. Posing a problem as classification problem # In[ ]: edges = dict() edges.get() # ## 2.1 Generating some edges which are not present in graph for supervised learning # Generated Bad links from graph which are not in graph and whose shortest path is greater than 2. # In[ ]: get_ipython().run_cell_magic('time', '', "###generating bad edges from given graph\nimport random\nif not os.path.isfile('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/missing_edges_final.p'):\n #getting all set of edges\n r = csv.reader(open('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_woheader.csv','r'))\n edges = dict()\n for edge in r:\n edges[(edge[0], edge[1])] = 1\n \n \n missing_edges = set([])\n while (len(missing_edges)<9437519):\n a=random.randint(1, 1862220)\n b=random.randint(1, 1862220)\n tmp = edges.get((a,b),-1)\n if tmp == -1 and a!=b:\n try:\n if nx.shortest_path_length(g,source=a,target=b) > 2: \n\n missing_edges.add((a,b))\n else:\n continue \n except: \n missing_edges.add((a,b)) \n else:\n continue\n pickle.dump(missing_edges,open('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/missing_edges_final.p','wb'))\nelse:\n missing_edges = pickle.load(open('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/missing_edges_final.p','rb'))\n") # In[ ]: len(missing_edges) # ## 2.2 Training and Test data split: # Removed edges from Graph and used as test data and after removing used that graph for creating features for Train and test data # In[ ]: from sklearn.model_selection import train_test_split if (not os.path.isfile('data/after_eda/train_pos_after_eda.csv')) and (not os.path.isfile('data/after_eda/test_pos_after_eda.csv')): #reading total data df df_pos = pd.read_csv('data/train.csv') df_neg = pd.DataFrame(list(missing_edges), columns=['source_node', 'destination_node']) print("Number of nodes in the graph with edges", df_pos.shape[0]) print("Number of nodes in the graph without edges", df_neg.shape[0]) #Trian test split #Spiltted data into 80-20 #positive links and negative links seperatly because we need positive training data only for creating graph #and for feature generation X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(df_pos,np.ones(len(df_pos)),test_size=0.2, random_state=9) X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(df_neg,np.zeros(len(df_neg)),test_size=0.2, random_state=9) print('='*60) print("Number of nodes in the train data graph with edges", X_train_pos.shape[0],"=",y_train_pos.shape[0]) print("Number of nodes in the train data graph without edges", X_train_neg.shape[0],"=", y_train_neg.shape[0]) 
print('='*60) print("Number of nodes in the test data graph with edges", X_test_pos.shape[0],"=",y_test_pos.shape[0]) print("Number of nodes in the test data graph without edges", X_test_neg.shape[0],"=",y_test_neg.shape[0]) #removing header and saving X_train_pos.to_csv('data/after_eda/train_pos_after_eda.csv',header=False, index=False) X_test_pos.to_csv('data/after_eda/test_pos_after_eda.csv',header=False, index=False) X_train_neg.to_csv('data/after_eda/train_neg_after_eda.csv',header=False, index=False) X_test_neg.to_csv('data/after_eda/test_neg_after_eda.csv',header=False, index=False) else: #Graph from Traing data only del missing_edges # In[ ]: if (os.path.isfile('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_pos_after_eda.csv')) and (os.path.isfile('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/test_pos_after_eda.csv')): train_graph=nx.read_edgelist('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_pos_after_eda.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int) test_graph=nx.read_edgelist('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/test_pos_after_eda.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int) print(nx.info(train_graph)) print(nx.info(test_graph)) # finding the unique nodes in the both train and test graphs train_nodes_pos = set(train_graph.nodes()) test_nodes_pos = set(test_graph.nodes()) trY_teY = len(train_nodes_pos.intersection(test_nodes_pos)) trY_teN = len(train_nodes_pos - test_nodes_pos) teY_trN = len(test_nodes_pos - train_nodes_pos) print('no of people common in train and test -- ',trY_teY) print('no of people present in train but not present in test -- ',trY_teN) print('no of people present in test but not present in train -- ',teY_trN) print(' % of people not there in Train but exist in Test in total Test data are {} %'.format(teY_trN/len(test_nodes_pos)*100)) # In[ ]: test_new_nodes = test_nodes_pos - train_nodes_pos # In[ ]: list(test_new_nodes)[:5] # In[ ]: del test_graph # > we have a cold start problem here # In[ ]: #final train and test data sets if (not os.path.isfile('data/after_eda/train_after_eda.csv')) and \ (not os.path.isfile('data/after_eda/test_after_eda.csv')) and \ (not os.path.isfile('data/train_y.csv')) and \ (not os.path.isfile('data/test_y.csv')) and \ (os.path.isfile('data/after_eda/train_pos_after_eda.csv')) and \ (os.path.isfile('data/after_eda/test_pos_after_eda.csv')) and \ (os.path.isfile('data/after_eda/train_neg_after_eda.csv')) and \ (os.path.isfile('data/after_eda/test_neg_after_eda.csv')): X_train_pos = pd.read_csv('data/after_eda/train_pos_after_eda.csv', names=['source_node', 'destination_node']) X_test_pos = pd.read_csv('data/after_eda/test_pos_after_eda.csv', names=['source_node', 'destination_node']) X_train_neg = pd.read_csv('data/after_eda/train_neg_after_eda.csv', names=['source_node', 'destination_node']) X_test_neg = pd.read_csv('data/after_eda/test_neg_after_eda.csv', names=['source_node', 'destination_node']) print('='*60) print("Number of nodes in the train data graph with edges", X_train_pos.shape[0]) print("Number of nodes in the train data graph without edges", X_train_neg.shape[0]) print('='*60) print("Number of nodes in the test data graph with edges", X_test_pos.shape[0]) print("Number of nodes in the test data graph without edges", X_test_neg.shape[0]) X_train = X_train_pos.append(X_train_neg,ignore_index=True) y_train = np.concatenate((y_train_pos,y_train_neg)) X_test = X_test_pos.append(X_test_neg,ignore_index=True) y_test = 
np.concatenate((y_test_pos,y_test_neg)) X_train.to_csv('data/after_eda/train_after_eda.csv',header=False,index=False) X_test.to_csv('data/after_eda/test_after_eda.csv',header=False,index=False) pd.DataFrame(y_train.astype(int)).to_csv('data/train_y.csv',header=False,index=False) pd.DataFrame(y_test.astype(int)).to_csv('data/test_y.csv',header=False,index=False) # In[ ]: print("Data points in train data",X_train.shape) print("Data points in test data",X_test.shape) print("Shape of traget variable in train",y_train.shape) print("Shape of traget variable in test", y_test.shape) # In[ ]: import sys # These are the usual ipython objects, including this one you are creating ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars'] # Get a sorted list of the objects and their sizes sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True) # In[ ]: # computed and store the data for featurization # please check out FB_featurization.ipynb #
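# > A minimal sketch of the negative-edge sampling used in section 2.1 above, on a
# > hypothetical toy graph (node ids and counts are illustrative, not from the dataset):
# > random node pairs are kept as "bad" links only if they are not existing edges and
# > their shortest path is longer than 2 (or no path exists at all).
import random
import networkx as nx

toy = nx.DiGraph([(1, 2), (2, 3), (3, 4), (4, 5), (1, 6), (6, 7)])
existing = set(toy.edges())
missing = set()
random.seed(0)
while len(missing) < 5:
    a, b = random.sample(list(toy.nodes()), 2)
    if (a, b) in existing or (a, b) in missing:
        continue
    try:
        if nx.shortest_path_length(toy, source=a, target=b) > 2:
            missing.add((a, b))
    except nx.NetworkXNoPath:
        # unreachable pairs also make good negative samples
        missing.add((a, b))
print(missing)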

Social network Graph Link Prediction - Facebook Challenge

# In[ ]: #Importing Libraries # please do go through this python notebook: import warnings warnings.filterwarnings("ignore") import csv import pandas as pd#pandas to create small dataframes import datetime #Convert to unix time import time #Convert to unix time # if numpy is not installed already : pip3 install numpy import numpy as np#Do aritmetic operations on arrays # matplotlib: used to plot graphs import matplotlib import matplotlib.pylab as plt import seaborn as sns#Plots from matplotlib import rcParams#Size of plots from sklearn.cluster import MiniBatchKMeans, KMeans#Clustering import math import pickle import os # to install xgboost: pip3 install xgboost import xgboost as xgb import warnings import networkx as nx import pdb import pickle from pandas import HDFStore,DataFrame from pandas import read_hdf from scipy.sparse.linalg import svds, eigs import gc from tqdm import tqdm # # 1. Reading Data # In[ ]: from google.colab import drive drive.mount('/content/drive') # In[ ]: if os.path.isfile('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_pos_after_eda.csv'): train_graph=nx.read_edgelist('drive/My Drive/FacebookGraphRecomm/data/data/after_eda/train_pos_after_eda.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int) print(nx.info(train_graph)) else: print("please run the FB_EDA.ipynb or download the files from drive") # # 2. Similarity measures # ## 2.1 Jaccard Distance: # http://www.statisticshowto.com/jaccard-index/ # \begin{equation} # j = \frac{|X\cap Y|}{|X \cup Y|} # \end{equation} # In[ ]: #for followees def jaccard_for_followees(a,b): try: if len(set(train_graph.successors(a))) == 0 | len(set(train_graph.successors(b))) == 0: return 0 sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\ (len(set(train_graph.successors(a)).union(set(train_graph.successors(b))))) except: return 0 return sim # In[ ]: #one test case print(jaccard_for_followees(273084,1505602)) # In[ ]: #node 1635354 not in graph print(jaccard_for_followees(273084,1505602)) # In[ ]: #for followers def jaccard_for_followers(a,b): try: if len(set(train_graph.predecessors(a))) == 0 | len(set(g.predecessors(b))) == 0: return 0 sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\ (len(set(train_graph.predecessors(a)).union(set(train_graph.predecessors(b))))) return sim except: return 0 # In[ ]: print(jaccard_for_followers(273084,470294)) # In[ ]: #node 1635354 not in graph print(jaccard_for_followees(669354,1635354)) # ## 2.2 Cosine distance # \begin{equation} # CosineDistance = \frac{|X\cap Y|}{|X|\cdot|Y|} # \end{equation} # In[ ]: #for followees def cosine_for_followees(a,b): try: if len(set(train_graph.successors(a))) == 0 | len(set(train_graph.successors(b))) == 0: return 0 sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\ (math.sqrt(len(set(train_graph.successors(a)))*len((set(train_graph.successors(b)))))) return sim except: return 0 # In[ ]: print(cosine_for_followees(273084,1505602)) # In[ ]: print(cosine_for_followees(273084,1635354)) # In[ ]: def cosine_for_followers(a,b): try: if len(set(train_graph.predecessors(a))) == 0 | len(set(train_graph.predecessors(b))) == 0: return 0 sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\ (math.sqrt(len(set(train_graph.predecessors(a))))*(len(set(train_graph.predecessors(b))))) return sim except: return 0 # In[ ]: print(cosine_for_followers(2,470294)) # In[ ]: 
print(cosine_for_followers(669354,1635354)) # ## 3. Ranking Measures # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html # # PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links. # # # # Mathematical PageRanks for a simple network, expressed as percentages. (Google uses a logarithmic scale.) Page C has a higher PageRank than Page E, even though there are fewer links to C; the one link to C comes from an important page and hence is of high value. If web surfers who start on a random page have an 85% likelihood of choosing a random link from the page they are currently visiting, and a 15% likelihood of jumping to a page chosen at random from the entire web, they will reach Page E 8.1% of the time. (The 15% likelihood of jumping to an arbitrary page corresponds to a damping factor of 85%.) Without damping, all web surfers would eventually end up on Pages A, B, or C, and all other pages would have PageRank zero. In the presence of damping, Page A effectively links to all pages in the web, even though it has no outgoing links of its own. # ## 3.1 Page Ranking # # https://en.wikipedia.org/wiki/PageRank # # In[ ]: if not os.path.isfile('data/fea_sample/page_rank.p'): pr = nx.pagerank(train_graph, alpha=0.85) pickle.dump(pr,open('data/fea_sample/page_rank.p','wb')) else: pr = pickle.load(open('data/fea_sample/page_rank.p','rb')) # In[ ]: print('min',pr[min(pr, key=pr.get)]) print('max',pr[max(pr, key=pr.get)]) print('mean',float(sum(pr.values())) / len(pr)) # In[ ]: #for imputing to nodes which are not there in Train data mean_pr = float(sum(pr.values())) / len(pr) print(mean_pr) # # 4. Other Graph Features # ## 4.1 Shortest path: # Getting Shortest path between twoo nodes, if nodes have direct path i.e directly connected then we are removing that edge and calculating path. # In[ ]: #if has direct edge then deleting that edge and calculating shortest path def compute_shortest_path_length(a,b): p=-1 try: if train_graph.has_edge(a,b): train_graph.remove_edge(a,b) p= nx.shortest_path_length(train_graph,source=a,target=b) train_graph.add_edge(a,b) else: p= nx.shortest_path_length(train_graph,source=a,target=b) return p except: return -1 # In[ ]: #testing compute_shortest_path_length(77697, 826021) # In[ ]: #testing compute_shortest_path_length(669354,1635354) # ## 4.2 Checking for same community # In[ ]: #getting weekly connected edges from graph wcc=list(nx.weakly_connected_components(train_graph)) def belongs_to_same_wcc(a,b): ''' Input two nodes : a , b . Output : Boolean (1 : They belong to same community (Weakly connected components), 0 They do not belong to same Weakly connected components) ''' index = [] if train_graph.has_edge(b,a): return 1 if train_graph.has_edge(a,b): for i in wcc: if a in i: index= i break if (b in index): train_graph.remove_edge(a,b) if compute_shortest_path_length(a,b)==-1: train_graph.add_edge(a,b) return 0 else: train_graph.add_edge(a,b) return 1 else: return 0 else: for i in wcc: if a in i: index= i break if(b in index): return 1 else: return 0 # In[ ]: belongs_to_same_wcc(861, 1659750) # In[ ]: train_graph.has_edge(861, 1659750) # In[ ]: belongs_to_same_wcc(669354,1635354) # ## 4.3 Adamic/Adar Index: # Adamic/Adar measures is defined as inverted sum of degrees of common neighbours for given two vertices. 
# $$A(x,y)=\sum_{u \in N(x) \cap N(y)}\frac{1}{log(|N(u)|)}$$ # In[ ]: #adar index def calc_adar_in(a,b): sum=0 try: n=list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))) if len(n)!=0: for i in n: sum=sum+(1/np.log10(len(list(train_graph.predecessors(i))))) return sum else: return 0 except: return 0 # In[ ]: calc_adar_in(1,189226) # In[ ]: calc_adar_in(669354,1635354) # ## 4.4 Is persion was following back: # In[ ]: def follows_back(a,b): if train_graph.has_edge(b,a): return 1 else: return 0 # In[ ]: follows_back(1,189226) # In[ ]: follows_back(669354,1635354) # ## 4.5 Katz Centrality: # https://en.wikipedia.org/wiki/Katz_centrality # # https://www.geeksforgeeks.org/katz-centrality-centrality-measure/ # Katz centrality computes the centrality for a node # based on the centrality of its neighbors. It is a # generalization of the eigenvector centrality. The # Katz centrality for node `i` is # # $$x_i = \alpha \sum_{j} A_{ij} x_j + \beta,$$ # where `A` is the adjacency matrix of the graph G # with eigenvalues $$\lambda$$. # # The parameter $$\beta$$ controls the initial centrality and # # $$\alpha < \frac{1}{\lambda_{max}}.$$ # In[ ]: if not os.path.isfile('data/fea_sample/katz.p'): katz = nx.katz.katz_centrality(train_graph,alpha=0.005,beta=1) pickle.dump(katz,open('data/fea_sample/katz.p','wb')) else: katz = pickle.load(open('data/fea_sample/katz.p','rb')) # In[ ]: print('min',katz[min(katz, key=katz.get)]) print('max',katz[max(katz, key=katz.get)]) print('mean',float(sum(katz.values())) / len(katz)) # In[ ]: mean_katz = float(sum(katz.values())) / len(katz) print(mean_katz) # ## 4.6 Hits Score # The HITS algorithm computes two numbers for a node. Authorities estimates the node value based on the incoming links. Hubs estimates the node value based on outgoing links. # # https://en.wikipedia.org/wiki/HITS_algorithm # In[ ]: if not os.path.isfile('data/fea_sample/hits.p'): hits = nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True) pickle.dump(hits,open('data/fea_sample/hits.p','wb')) else: hits = pickle.load(open('data/fea_sample/hits.p','rb')) # In[ ]: print('min',hits[0][min(hits[0], key=hits[0].get)]) print('max',hits[0][max(hits[0], key=hits[0].get)]) print('mean',float(sum(hits[0].values())) / len(hits[0])) # ## 4.7 Calculating Preferential Attachment # In[ ]: #Preferential Attachment def calc_pref_att(a,b): try: return len(set(train_graph.predecessors(a))) * len(set(train_graph.predecessors(b))) except: return 0 # In[ ]: #testing calc_pref_att(1,189226) # ## 4.8 SVD Dot Features # # In[ ]: #svd_dot_u def svd_dot_u(node): try: s_node = node[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] d_node = node[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] return np.dot(s_node,d_node) except: return 0 # In[ ]: #svd_dot_v def svd_dot_v(node): try: s_node = node[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] d_node = node[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] return np.dot(s_node,d_node) except: return 0 # In[ ]: svd_dot_v(df_final_train.iloc[1]) # # 5. Featurization # ## 5. 
1 Reading a sample of Data from both train and test # In[ ]: import random if os.path.isfile('data/after_eda/train_after_eda.csv'): filename = "data/after_eda/train_after_eda.csv" # you uncomment this line, if you dont know the lentgh of the file name # here we have hardcoded the number of lines as 15100030 # n_train = sum(1 for line in open(filename)) #number of records in file (excludes header) n_train = 15100028 s = 100000 #desired sample size skip_train = sorted(random.sample(range(1,n_train+1),n_train-s)) #https://stackoverflow.com/a/22259008/4084039 # In[ ]: if os.path.isfile('data/after_eda/train_after_eda.csv'): filename = "data/after_eda/test_after_eda.csv" # you uncomment this line, if you dont know the lentgh of the file name # here we have hardcoded the number of lines as 3775008 # n_test = sum(1 for line in open(filename)) #number of records in file (excludes header) n_test = 3775006 s = 50000 #desired sample size skip_test = sorted(random.sample(range(1,n_test+1),n_test-s)) #https://stackoverflow.com/a/22259008/4084039 # In[ ]: print("Number of rows in the train data file:", n_train) print("Number of rows we are going to elimiate in train data are",len(skip_train)) print("Number of rows in the test data file:", n_test) print("Number of rows we are going to elimiate in test data are",len(skip_test)) # In[ ]: df_final_train = pd.read_csv('data/after_eda/train_after_eda.csv', skiprows=skip_train, names=['source_node', 'destination_node']) df_final_train['indicator_link'] = pd.read_csv('data/train_y.csv', skiprows=skip_train, names=['indicator_link']) print("Our train matrix size ",df_final_train.shape) df_final_train.head(2) # In[ ]: df_final_test = pd.read_csv('data/after_eda/test_after_eda.csv', skiprows=skip_test, names=['source_node', 'destination_node']) df_final_test['indicator_link'] = pd.read_csv('data/test_y.csv', skiprows=skip_test, names=['indicator_link']) print("Our test matrix size ",df_final_test.shape) df_final_test.head(2) # ## 5.2 Adding a set of features # # __we will create these each of these features for both train and test data points__ #
# 1. jaccard_followers
# 2. jaccard_followees
# 3. cosine_followers
# 4. cosine_followees
# 5. num_followers_s
# 6. num_followees_s
# 7. num_followers_d
# 8. num_followees_d
# 9. inter_followers
# 10. inter_followees
# (a toy sketch of these pairwise features follows this list)
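# > A small illustrative sketch (toy graph, hypothetical node ids) of the stage-1
# > pairwise features listed above: follower/followee counts, their intersections
# > and the Jaccard overlap for one candidate (source, destination) pair.
import networkx as nx

toy = nx.DiGraph([(10, 1), (20, 1), (10, 2), (30, 2), (1, 40), (2, 40)])
src, dst = 1, 2

followers_s = set(toy.predecessors(src))   # {10, 20}
followers_d = set(toy.predecessors(dst))   # {10, 30}
followees_s = set(toy.successors(src))     # {40}
followees_d = set(toy.successors(dst))     # {40}

inter_followers = len(followers_s & followers_d)                      # 1
jaccard_followers = inter_followers / len(followers_s | followers_d)  # 1/3
print(len(followers_s), len(followers_d), inter_followers, jaccard_followers)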
# In[ ]: if not os.path.isfile('data/fea_sample/storage_sample_stage1.h5'): #mapping jaccrd followers to train and test data df_final_train['jaccard_followers'] = df_final_train.apply(lambda row: jaccard_for_followers(row['source_node'],row['destination_node']),axis=1) df_final_test['jaccard_followers'] = df_final_test.apply(lambda row: jaccard_for_followers(row['source_node'],row['destination_node']),axis=1) #mapping jaccrd followees to train and test data df_final_train['jaccard_followees'] = df_final_train.apply(lambda row: jaccard_for_followees(row['source_node'],row['destination_node']),axis=1) df_final_test['jaccard_followees'] = df_final_test.apply(lambda row: jaccard_for_followees(row['source_node'],row['destination_node']),axis=1) #mapping jaccrd followers to train and test data df_final_train['cosine_followers'] = df_final_train.apply(lambda row: cosine_for_followers(row['source_node'],row['destination_node']),axis=1) df_final_test['cosine_followers'] = df_final_test.apply(lambda row: cosine_for_followers(row['source_node'],row['destination_node']),axis=1) #mapping jaccrd followees to train and test data df_final_train['cosine_followees'] = df_final_train.apply(lambda row: cosine_for_followees(row['source_node'],row['destination_node']),axis=1) df_final_test['cosine_followees'] = df_final_test.apply(lambda row: cosine_for_followees(row['source_node'],row['destination_node']),axis=1) # In[ ]: def compute_features_stage1(df_final): #calculating no of followers followees for source and destination #calculating intersection of followers and followees for source and destination num_followers_s=[] num_followees_s=[] num_followers_d=[] num_followees_d=[] inter_followers=[] inter_followees=[] for i,row in df_final.iterrows(): try: s1=set(train_graph.predecessors(row['source_node'])) s2=set(train_graph.successors(row['source_node'])) except: s1 = set() s2 = set() try: d1=set(train_graph.predecessors(row['destination_node'])) d2=set(train_graph.successors(row['destination_node'])) except: d1 = set() d2 = set() num_followers_s.append(len(s1)) num_followees_s.append(len(s2)) num_followers_d.append(len(d1)) num_followees_d.append(len(d2)) inter_followers.append(len(s1.intersection(d1))) inter_followees.append(len(s2.intersection(d2))) return num_followers_s, num_followers_d, num_followees_s, num_followees_d, inter_followers, inter_followees # In[ ]: if not os.path.isfile('data/fea_sample/storage_sample_stage1.h5'): df_final_train['num_followers_s'], df_final_train['num_followers_d'], \ df_final_train['num_followees_s'], df_final_train['num_followees_d'], \ df_final_train['inter_followers'], df_final_train['inter_followees']= compute_features_stage1(df_final_train) df_final_test['num_followers_s'], df_final_test['num_followers_d'], \ df_final_test['num_followees_s'], df_final_test['num_followees_d'], \ df_final_test['inter_followers'], df_final_test['inter_followees']= compute_features_stage1(df_final_test) hdf = HDFStore('data/fea_sample/storage_sample_stage1.h5') hdf.put('train_df',df_final_train, format='table', data_columns=True) hdf.put('test_df',df_final_test, format='table', data_columns=True) hdf.close() else: df_final_train = read_hdf('data/fea_sample/storage_sample_stage1.h5', 'train_df',mode='r') df_final_test = read_hdf('data/fea_sample/storage_sample_stage1.h5', 'test_df',mode='r') # ## 5.3 Adding new set of features # # __we will create these each of these features for both train and test data points__ #
# 1. adar index
# 2. is following back
# 3. belongs to same weakly connected components
# 4. shortest path between source and destination
# (a toy sketch of the shortest-path feature follows this list)
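# > A small sketch (toy graph) of the shortest-path feature from section 4.1: if the
# > candidate edge already exists it is removed temporarily, so the feature measures
# > how close the two nodes would be without the very link we are trying to predict.
import networkx as nx

toy = nx.DiGraph([(1, 2), (2, 3), (1, 3)])

def shortest_path_without_edge(graph, a, b):
    had_edge = graph.has_edge(a, b)
    if had_edge:
        graph.remove_edge(a, b)
    try:
        p = nx.shortest_path_length(graph, source=a, target=b)
    except nx.NetworkXNoPath:
        p = -1
    if had_edge:
        graph.add_edge(a, b)
    return p

print(shortest_path_without_edge(toy, 1, 3))  # 2: goes via node 2 once (1, 3) is removed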
# In[ ]: if not os.path.isfile('data/fea_sample/storage_sample_stage2.h5'): #mapping adar index on train df_final_train['adar_index'] = df_final_train.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1) #mapping adar index on test df_final_test['adar_index'] = df_final_test.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1) #-------------------------------------------------------------------------------------------------------- #mapping followback or not on train df_final_train['follows_back'] = df_final_train.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1) #mapping followback or not on test df_final_test['follows_back'] = df_final_test.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1) #-------------------------------------------------------------------------------------------------------- #mapping same component of wcc or not on train df_final_train['same_comp'] = df_final_train.apply(lambda row: belongs_to_same_wcc(row['source_node'],row['destination_node']),axis=1) ##mapping same component of wcc or not on train df_final_test['same_comp'] = df_final_test.apply(lambda row: belongs_to_same_wcc(row['source_node'],row['destination_node']),axis=1) #-------------------------------------------------------------------------------------------------------- #mapping shortest path on train df_final_train['shortest_path'] = df_final_train.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1) #mapping shortest path on test df_final_test['shortest_path'] = df_final_test.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1) hdf = HDFStore('data/fea_sample/storage_sample_stage2.h5') hdf.put('train_df',df_final_train, format='table', data_columns=True) hdf.put('test_df',df_final_test, format='table', data_columns=True) hdf.close() else: df_final_train = read_hdf('data/fea_sample/storage_sample_stage2.h5', 'train_df',mode='r') df_final_test = read_hdf('data/fea_sample/storage_sample_stage2.h5', 'test_df',mode='r') # ## 5.4 Adding new set of features # # __we will create these each of these features for both train and test data points__ #
# 1. weight features
# 2. page ranking of source
# 3. page ranking of dest
# 4. katz of source
# 5. katz of dest
# 6. hubs of source
# 7. hubs of dest
# 8. authorities of source
# 9. authorities of dest
# (a toy sketch of the centrality features follows this list)
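# > A small sketch (toy graph) of the stage-3 centrality features listed above:
# > PageRank, Katz centrality and HITS hubs/authorities are computed once on the
# > train graph and then looked up per node, with the mean (or 0) used as a fallback
# > for nodes that never appear in the train graph.
import networkx as nx

toy = nx.DiGraph([(1, 2), (2, 3), (3, 1), (1, 4)])

pr = nx.pagerank(toy, alpha=0.85)
katz = nx.katz_centrality(toy, alpha=0.005, beta=1)
hubs, authorities = nx.hits(toy)

mean_pr = sum(pr.values()) / len(pr)
mean_katz = sum(katz.values()) / len(katz)

unseen = 999  # hypothetical node present only in the test split
print(pr.get(unseen, mean_pr), katz.get(unseen, mean_katz),
      hubs.get(unseen, 0), authorities.get(unseen, 0))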
# #### Weight Features # In order to determine the similarity of nodes, an edge weight value was calculated between nodes. Edge weight decreases as the neighbor count goes up. Intuitively, consider one million people following a celebrity on a social network then chances are most of them never met each other or the celebrity. On the other hand, if a user has 30 contacts in his/her social network, the chances are higher that many of them know each other. # `credit` - Graph-based Features for Supervised Link Prediction # William Cukierski, Benjamin Hamner, Bo Yang # \begin{equation} # W = \frac{1}{\sqrt{1+|X|}} # \end{equation} # it is directed graph so calculated Weighted in and Weighted out differently # In[ ]: #weight for source and destination of each link Weight_in = {} Weight_out = {} for i in tqdm(train_graph.nodes()): s1=set(train_graph.predecessors(i)) w_in = 1.0/(np.sqrt(1+len(s1))) Weight_in[i]=w_in s2=set(train_graph.successors(i)) w_out = 1.0/(np.sqrt(1+len(s2))) Weight_out[i]=w_out #for imputing with mean mean_weight_in = np.mean(list(Weight_in.values())) mean_weight_out = np.mean(list(Weight_out.values())) # In[ ]: if not os.path.isfile('data/fea_sample/storage_sample_stage3.h5'): #mapping to pandas train df_final_train['weight_in'] = df_final_train.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in)) df_final_train['weight_out'] = df_final_train.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out)) #mapping to pandas test df_final_test['weight_in'] = df_final_test.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in)) df_final_test['weight_out'] = df_final_test.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out)) #some features engineerings on the in and out weights df_final_train['weight_f1'] = df_final_train.weight_in + df_final_train.weight_out df_final_train['weight_f2'] = df_final_train.weight_in * df_final_train.weight_out df_final_train['weight_f3'] = (2*df_final_train.weight_in + 1*df_final_train.weight_out) df_final_train['weight_f4'] = (1*df_final_train.weight_in + 2*df_final_train.weight_out) #some features engineerings on the in and out weights df_final_test['weight_f1'] = df_final_test.weight_in + df_final_test.weight_out df_final_test['weight_f2'] = df_final_test.weight_in * df_final_test.weight_out df_final_test['weight_f3'] = (2*df_final_test.weight_in + 1*df_final_test.weight_out) df_final_test['weight_f4'] = (1*df_final_test.weight_in + 2*df_final_test.weight_out) # In[ ]: if not os.path.isfile('data/fea_sample/storage_sample_stage3.h5'): #page rank for source and destination in Train and Test #if anything not there in train graph then adding mean page rank df_final_train['page_rank_s'] = df_final_train.source_node.apply(lambda x:pr.get(x,mean_pr)) df_final_train['page_rank_d'] = df_final_train.destination_node.apply(lambda x:pr.get(x,mean_pr)) df_final_test['page_rank_s'] = df_final_test.source_node.apply(lambda x:pr.get(x,mean_pr)) df_final_test['page_rank_d'] = df_final_test.destination_node.apply(lambda x:pr.get(x,mean_pr)) #================================================================================ #Katz centrality score for source and destination in Train and test #if anything not there in train graph then adding mean katz score df_final_train['katz_s'] = df_final_train.source_node.apply(lambda x: katz.get(x,mean_katz)) df_final_train['katz_d'] = df_final_train.destination_node.apply(lambda x: katz.get(x,mean_katz)) df_final_test['katz_s'] = df_final_test.source_node.apply(lambda x: katz.get(x,mean_katz)) 
df_final_test['katz_d'] = df_final_test.destination_node.apply(lambda x: katz.get(x,mean_katz)) #================================================================================ #Hits algorithm score for source and destination in Train and test #if anything not there in train graph then adding 0 df_final_train['hubs_s'] = df_final_train.source_node.apply(lambda x: hits[0].get(x,0)) df_final_train['hubs_d'] = df_final_train.destination_node.apply(lambda x: hits[0].get(x,0)) df_final_test['hubs_s'] = df_final_test.source_node.apply(lambda x: hits[0].get(x,0)) df_final_test['hubs_d'] = df_final_test.destination_node.apply(lambda x: hits[0].get(x,0)) #================================================================================ #Hits algorithm score for source and destination in Train and Test #if anything not there in train graph then adding 0 df_final_train['authorities_s'] = df_final_train.source_node.apply(lambda x: hits[1].get(x,0)) df_final_train['authorities_d'] = df_final_train.destination_node.apply(lambda x: hits[1].get(x,0)) df_final_test['authorities_s'] = df_final_test.source_node.apply(lambda x: hits[1].get(x,0)) df_final_test['authorities_d'] = df_final_test.destination_node.apply(lambda x: hits[1].get(x,0)) #================================================================================ hdf = HDFStore('data/fea_sample/storage_sample_stage3.h5') hdf.put('train_df',df_final_train, format='table', data_columns=True) hdf.put('test_df',df_final_test, format='table', data_columns=True) hdf.close() else: df_final_train = read_hdf('data/fea_sample/storage_sample_stage3.h5', 'train_df',mode='r') df_final_test = read_hdf('data/fea_sample/storage_sample_stage3.h5', 'test_df',mode='r') # ## 5.5 Adding new set of features # # __we will create these each of these features for both train and test data points__ #
# 1. SVD features for both source and destination
# (a toy sketch of the SVD features follows this list)
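# > A small sketch (toy graph, k=2 instead of the k=6 used below) of the SVD node
# > features listed above: a truncated SVD of the adjacency matrix gives every node a
# > left (U) and right (V) embedding, looked up by the node's index in the sorted
# > node list, with a zero vector for nodes missing from the train graph.
import networkx as nx
import numpy as np
from scipy.sparse.linalg import svds

toy = nx.DiGraph([(1, 2), (2, 3), (3, 4), (4, 1), (1, 3), (2, 4)])
nodes = sorted(toy.nodes())
node_to_idx = {n: i for i, n in enumerate(nodes)}

Adj = nx.adjacency_matrix(toy, nodelist=nodes).astype(float)
U, s, Vt = svds(Adj, k=2)   # U: (n, 2), Vt: (2, n)

def node_embedding(node, mat):
    idx = node_to_idx.get(node)
    return mat[idx] if idx is not None else np.zeros(mat.shape[1])

print(node_embedding(1, U), node_embedding(999, U))   # 999 is a hypothetical unseen node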
# In[ ]: def svd(x, S): try: z = sadj_dict[x] return S[z] except: return [0,0,0,0,0,0] # In[ ]: #for svd features to get feature vector creating a dict node val and inedx in svd vector sadj_col = sorted(train_graph.nodes()) sadj_dict = { val:idx for idx,val in enumerate(sadj_col)} # In[ ]: Adj = nx.adjacency_matrix(train_graph,nodelist=sorted(train_graph.nodes())).asfptype() # In[ ]: U, s, V = svds(Adj, k = 6) print('Adjacency matrix Shape',Adj.shape) print('U Shape',U.shape) print('V Shape',V.shape) print('s Shape',s.shape) # In[ ]: type(df_final_train) # In[ ]: if not os.path.isfile('data/fea_sample/storage_sample_stage4.h5'): #=================================================================================================== df_final_train[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \ df_final_train.source_node.apply(lambda x: svd(x, U)).apply(pd.Series) df_final_train[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \ df_final_train.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series) #=================================================================================================== df_final_train[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \ df_final_train.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series) df_final_train[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \ df_final_train.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series) #=================================================================================================== df_final_test[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \ df_final_test.source_node.apply(lambda x: svd(x, U)).apply(pd.Series) df_final_test[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \ df_final_test.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series) #=================================================================================================== df_final_test[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \ df_final_test.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series) df_final_test[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \ df_final_test.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series) #=================================================================================================== hdf = HDFStore('data/fea_sample/storage_sample_stage4.h5') hdf.put('train_df',df_final_train, format='table', data_columns=True) hdf.put('test_df',df_final_test, format='table', data_columns=True) hdf.close() # In[ ]: # prepared and stored the data from machine learning models # pelase check the FB_Models.ipynb # In[ ]: df_final_train.columns # # In[ ]: get_ipython().run_cell_magic('time', '', "s_node = df_final_train.loc[1][['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]]\n") # In[ ]: d_node = df_final_train.iloc[182][['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] # In[ ]: type(s_node) # In[ ]: s_node[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] # In[ ]: d_node[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] # In[ ]: get_ipython().run_cell_magic('time', '', 'sum_x = 0.0\nfor i in range(6):\n sum_x += s_node[i]*d_node[i]\n') # In[ ]: print(sum_x) # In[ ]: get_ipython().run_cell_magic('time', '', 
'np.dot(np.array(s_node),np.array(d_node))\n') # In[ ]: get_ipython().run_cell_magic('time', '', 'np.dot(s_node,d_node)\n') # In[ ]: df_final_test.iloc[1][['source_node','destination_node']] # In[ ]: get_ipython().run_cell_magic('time', '', "df_final_train['svd_dot_u'] = df_final_train.apply(lambda row:svd_dot_u(row),axis=1)\ndf_final_train['svd_dot_v'] = df_final_train.apply(lambda row:svd_dot_v(row),axis=1)\n\n\ndf_final_test['svd_dot_u'] = df_final_test.apply(lambda row:svd_dot_u(row),axis=1)\ndf_final_test['svd_dot_v'] = df_final_test.apply(lambda row:svd_dot_v(row),axis=1)\n") # In[ ]: df_final_train['pref_att'] = df_final_train.apply(lambda row: calc_pref_att(row['source_node'],row['destination_node']),axis=1) df_final_test['pref_att'] = df_final_test.apply(lambda row: calc_pref_att(row['source_node'],row['destination_node']),axis=1) # In[ ]: df_final_train.shape # In[ ]: df_final_test.shape # In[ ]: df_final_train.iloc[1]['svd_dot_u'] # In[ ]: svd_dot_u(df_final_train.iloc[1]) #
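# > Optional follow-up sketch: the per-row apply used above for svd_dot can be replaced by a
# > single vectorized step. The column names follow the svd_u_s_* / svd_u_d_* convention used
# > in this notebook; treat this as an optimization idea, not part of the original solution.
import numpy as np

u_s_cols = ['svd_u_s_1', 'svd_u_s_2', 'svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']
u_d_cols = ['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5', 'svd_u_d_6']

def svd_dot_u_vectorized(df):
    # row-wise dot product between the source and destination SVD-U vectors
    return np.einsum('ij,ij->i', df[u_s_cols].to_numpy(), df[u_d_cols].to_numpy())

# usage, assuming df_final_train already has the stage-4 SVD columns:
# df_final_train['svd_dot_u'] = svd_dot_u_vectorized(df_final_train)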

Social network Graph Link Prediction - Facebook Challenge

# In[ ]: #Importing Libraries # please do go through this python notebook: import warnings warnings.filterwarnings("ignore") import csv import pandas as pd#pandas to create small dataframes import datetime #Convert to unix time import time #Convert to unix time # if numpy is not installed already : pip3 install numpy import numpy as np#Do aritmetic operations on arrays # matplotlib: used to plot graphs import matplotlib import matplotlib.pylab as plt import seaborn as sns#Plots from matplotlib import rcParams#Size of plots from sklearn.cluster import MiniBatchKMeans, KMeans#Clustering import math import pickle import os # to install xgboost: pip3 install xgboost import xgboost as xgb import warnings import networkx as nx import pdb import pickle from pandas import HDFStore,DataFrame from pandas import read_hdf from scipy.sparse.linalg import svds, eigs import gc from tqdm import tqdm from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import f1_score # In[ ]: #reading from pandas import read_hdf df_final_train = read_hdf('drive/My Drive/FacebookGraphRecomm/data/data/fea_sample/storage_sample_stage4.h5', 'train_df',mode='r') df_final_test = read_hdf('drive/My Drive/FacebookGraphRecomm/data/data/fea_sample/storage_sample_stage4.h5', 'test_df',mode='r') # In[ ]: type(df_final_train) # In[ ]: df_final_train.columns # In[ ]: y_train = df_final_train.indicator_link y_test = df_final_test.indicator_link # In[ ]: df_final_train.drop(['source_node', 'destination_node','indicator_link'],axis=1,inplace=True) df_final_test.drop(['source_node', 'destination_node','indicator_link'],axis=1,inplace=True) # In[ ]: estimators = [10,50,100,250,450] train_scores = [] test_scores = [] for i in estimators: clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=5, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=52, min_samples_split=120, min_weight_fraction_leaf=0.0, n_estimators=i, n_jobs=-1,random_state=25,verbose=0,warm_start=False) clf.fit(df_final_train,y_train) train_sc = f1_score(y_train,clf.predict(df_final_train)) test_sc = f1_score(y_test,clf.predict(df_final_test)) test_scores.append(test_sc) train_scores.append(train_sc) print('Estimators = ',i,'Train Score',train_sc,'test Score',test_sc) plt.plot(estimators,train_scores,label='Train Score') plt.plot(estimators,test_scores,label='Test Score') plt.xlabel('Estimators') plt.ylabel('Score') plt.title('Estimators vs score at depth of 5') # In[ ]: depths = [3,9,11,15,20,35,50,70,130] train_scores = [] test_scores = [] for i in depths: clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=i, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=52, min_samples_split=120, min_weight_fraction_leaf=0.0, n_estimators=115, n_jobs=-1,random_state=25,verbose=0,warm_start=False) clf.fit(df_final_train,y_train) train_sc = f1_score(y_train,clf.predict(df_final_train)) test_sc = f1_score(y_test,clf.predict(df_final_test)) test_scores.append(test_sc) train_scores.append(train_sc) print('depth = ',i,'Train Score',train_sc,'test Score',test_sc) plt.plot(depths,train_scores,label='Train Score') plt.plot(depths,test_scores,label='Test Score') plt.xlabel('Depth') plt.ylabel('Score') plt.title('Depth vs score at depth of 5 at estimators = 115') plt.show() # In[ ]: from sklearn.metrics import f1_score from sklearn.ensemble import RandomForestClassifier from 
sklearn.metrics import f1_score from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint as sp_randint from scipy.stats import uniform param_dist = {"n_estimators":sp_randint(105,125), "max_depth": sp_randint(10,15), "min_samples_split": sp_randint(110,190), "min_samples_leaf": sp_randint(25,65)} clf = RandomForestClassifier(random_state=25,n_jobs=-1) rf_random = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=5,cv=10,scoring='f1',random_state=25) rf_random.fit(df_final_train,y_train) print('mean test scores',rf_random.cv_results_['mean_test_score']) print('mean train scores',rf_random.cv_results_['mean_train_score']) # In[ ]: print(rf_random.best_estimator_) # In[ ]: clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=14, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=28, min_samples_split=111, min_weight_fraction_leaf=0.0, n_estimators=121, n_jobs=-1, oob_score=False, random_state=25, verbose=0, warm_start=False) # In[ ]: clf.fit(df_final_train,y_train) y_train_pred = clf.predict(df_final_train) y_test_pred = clf.predict(df_final_test) # In[ ]: from sklearn.metrics import f1_score print('Train f1 score',f1_score(y_train,y_train_pred)) print('Test f1 score',f1_score(y_test,y_test_pred)) # In[ ]: from sklearn.metrics import confusion_matrix def plot_confusion_matrix(test_y, predict_y): C = confusion_matrix(test_y, predict_y) A =(((C.T)/(C.sum(axis=1))).T) B =(C/C.sum(axis=0)) plt.figure(figsize=(20,4)) labels = [0,1] # representing A in heatmap format cmap=sns.light_palette("blue") plt.subplot(1, 3, 1) sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels) plt.xlabel('Predicted Class') plt.ylabel('Original Class') plt.title("Confusion matrix") plt.subplot(1, 3, 2) sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels) plt.xlabel('Predicted Class') plt.ylabel('Original Class') plt.title("Precision matrix") plt.subplot(1, 3, 3) # representing B in heatmap format sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels) plt.xlabel('Predicted Class') plt.ylabel('Original Class') plt.title("Recall matrix") plt.show() # In[ ]: print('Train confusion_matrix') plot_confusion_matrix(y_train,y_train_pred) print('Test confusion_matrix') plot_confusion_matrix(y_test,y_test_pred) # In[ ]: from sklearn.metrics import roc_curve, auc fpr,tpr,ths = roc_curve(y_test,y_test_pred) auc_sc = auc(fpr, tpr) plt.plot(fpr, tpr, color='navy',label='ROC curve (area = %0.2f)' % auc_sc) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic with test data') plt.legend() plt.show() # In[ ]: features = df_final_train.columns importances = clf.feature_importances_ indices = (np.argsort(importances))[-25:] plt.figure(figsize=(10,12)) plt.title('Feature Importances') plt.barh(range(len(indices)), importances[indices], color='r', align='center') plt.yticks(range(len(indices)), [features[i] for i in indices]) plt.xlabel('Relative Importance') plt.show() # # Assignments: # # 1. Add another feature called Preferential Attachment with followers and followees data of vertex. you can check about Preferential Attachment in below link # http://be.amazd.com/link-prediction/
# 2. Add a feature called svd_dot. You can calculate svd_dot as the dot product between the source node's SVD features and the destination node's SVD features. You can read about this in the pdf below: # https://storage.googleapis.com/kaggle-forum-message-attachments/2594/supervised_link_prediction.pdf
# 3. Tune hyperparameters for XG boost with all these features and check the error metric. # In[ ]: df_final_train.columns # In[ ]: df_final_test.columns # In[ ]: from sklearn.metrics import f1_score from sklearn.metrics import f1_score from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint as sp_randint from scipy.stats import uniform # ### Applying XGBOOST # In[ ]: import xgboost as xgb clf = xgb.XGBClassifier() param_dist = {"n_estimators":sp_randint(105,125), "max_depth": sp_randint(2,10) } model = RandomizedSearchCV(clf, param_distributions=param_dist,n_jobs=4, n_iter=5,cv=3,scoring='f1',random_state=25,return_train_score = True) model.fit(df_final_train,y_train) print('mean test scores',model.cv_results_['mean_test_score']) print('mean train scores',model.cv_results_['mean_train_score']) # In[ ]: model.cv_results_ # In[ ]: results = pd.DataFrame.from_dict(model.cv_results_) results = results.sort_values(['param_max_depth','param_n_estimators']) train_auc =results['mean_train_score'] train_auc_std= results['std_train_score'] cv_auc = results['mean_test_score'] cv_auc_std= results['std_test_score'] results_score_sorted = results.sort_values(by=['mean_test_score'],ascending=False) results_score_sorted.head() # In[ ]: print(model.best_estimator_) # In[ ]: clf=xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7, min_child_weight=1, missing=None, n_estimators=117, n_jobs=4, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1) # In[ ]: clf.fit(df_final_train,y_train) y_train_pred = clf.predict(df_final_train) y_test_pred = clf.predict(df_final_test) # In[ ]: from sklearn.metrics import f1_score print('Train f1 score',f1_score(y_train,y_train_pred)) print('Test f1 score',f1_score(y_test,y_test_pred)) # In[ ]: print('Train confusion_matrix') plot_confusion_matrix(y_train,y_train_pred) print('Test confusion_matrix') plot_confusion_matrix(y_test,y_test_pred) # In[ ]: from sklearn.metrics import roc_curve, auc fpr,tpr,ths = roc_curve(y_test,y_test_pred) auc_sc = auc(fpr, tpr) plt.plot(fpr, tpr, color='navy',label='ROC curve (area = %0.2f)' % auc_sc) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic with test data') plt.legend() plt.show() # In[ ]: features = df_final_train.columns importances = clf.feature_importances_ indices = (np.argsort(importances))[-25:] plt.figure(figsize=(10,12)) plt.title('Feature Importances') plt.barh(range(len(indices)), importances[indices], color='r', align='center') plt.yticks(range(len(indices)), [features[i] for i in indices]) plt.xlabel('Relative Importance') plt.show() # In[ ]: # Please compare all your models using Prettytable library # http://zetcode.com/python/prettytable/ from prettytable import PrettyTable RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=14, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=28, min_samples_split=111, min_weight_fraction_leaf=0.0, n_estimators=121, n_jobs=-1, oob_score=False, random_state=25, verbose=0, warm_start=False) #If you get a ModuleNotFoundError error , install prettytable using: pip3 install prettytable x = PrettyTable() x.field_names = ["Vectorizer", "Model", "Hyper Parameter", "F1-Score"] x.add_row(["Previous Graph Based features", 
"Random Forest", "Max Depth:14 , Estimators : 111, min_samples_leaf:28, min_samples_split:111", 0.92]) x.add_row(["Previous Graph Based features + Two new features", "XGBoost", "Max Depth:7 , Estimators : 117", 0.93]) print(x) # ## Observations: # # 1. XGBoost also performs very similar to Random Forest . # 2. Two new added features Preferntial attachment and svd_dot are also not very important as per XGBoost model , hence not much improvement in results .