#!/usr/bin/env python # coding: utf-8 # # Introduction to Network Analysis with NetworkX # # *by Georgy Lazarev* **(mlcourse slack: jorgy)** # Systems are ubiquitous these days in various fields of science, industry and daily-life situations. Majority of them (to name few, social interactions, telephone and transport networks) can be represented as networks or graphs (sets of nodes and edges where nodes characterize certain objects and edges indicate some kind of connection between them) and therefore are suitable for analysis. # # We might want to analyze relationships between participants or actor in those systems to get some valuable insights: # # - Which nodes are more important (influencers in network) # - Pathfinding – determining shortest paths between certain nodes # - Structure - finding cliques, community detection (we'll discuss those terms later) # # In this tutorial I'll introduce you to the NetworkX - Python package for 'the creation, manipulation, and study of the structure, dynamics, and functions of complex networks'. Important properties pointed out by developers: # - provides an interface to existing numerical algorithms and code written in C, C++, and FORTRAN, thus resulting in promising speed perfomance # - the ability to painlessly work with large nonstandard data sets. # # Let's start with some basic things. At first you need to install NetworkX (*'pip install networkx'* or *'conda install ..'* if you are working under Anaconda) # In[45]: #importing necessary packages import networkx as nx import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import csv import numpy as np get_ipython().run_line_magic('matplotlib', 'inline') from collections import defaultdict import warnings warnings.filterwarnings('ignore') # In[46]: #creating empty Graph G = nx.Graph() #you can now add nodes and edges G.add_node('a') G.add_node(1) G.add_edge('a',1) #or creating list of nodes/edges in advance G.add_nodes_from([2,3,4]) G.add_edges_from([(2,4),('a',3)]) # In[47]: #setting attribute while creating node G.add_node('John', age=26) #setting attribute to already existing node G.node['John']['hobby']='writing music' nx.set_node_attributes(G,'number',7) #graph with random weghts weights={} for e in G.edges(): G[e[0]][e[1]]['weight']=np.random.random() # In[48]: print(G.edges()) print(G.edges(data=True)) #data=True prints attributes too # In[49]: display(type(G)) # In[50]: print(nx.info(G)) # In[51]: nx.draw(G,with_labels=True,node_size=900) # It's probably worth mentioning that calling .draw without specifying *pos* argument will result in different layouts of nodes. # ## Analysis of real graph # [Here](http://konect.uni-koblenz.de/networks/) you can find real networks stored as graph representations (edgelist mostly) in .tsv format. For simplicity I picked # quite small undirected [graph](https://www.dropbox.com/s/4n86gethjw98gqk/out.dolphins?dl=0), which represent frequent social interactions between 62 dolphins from community living off Doubtful Sound, # a fjord in New Zealand. So this is sort of social network. # # Reading graphs from files doesn't have unified way as data can be stored in different formats and graphs can be represented differently (adjacency matrix, edge lists, adjacency list). I'd recommend you go for reference [here](https://networkx.github.io/documentation/networkx-1.10/reference/convert.html) and [here](https://networkx.github.io/documentation/stable/reference/readwrite/index.html) # In[52]: edges = [] with open('datasets/out.dolphins', 'r') as f: filereader = csv.reader(f, delimiter="\t", quotechar='"') next(filereader) # we'll skip header row for row in filereader: edges.append(row[:2]) edges[:5] # In[53]: #now create graph from edgelist we made in previous step GD = nx.from_edgelist(edges) # In[54]: print(nx.info(GD)) # In[55]: nx.draw(GD,with_labels=True) # There are other types of vizualization except usual one (especially useful when you are dealing with large graphs) - ArcPlot, MatrixPlot and etc. I'll show only one here - CircosPlot # Let's install and import nice visualization package for NetworkX - [nxviz](https://github.com/ericmjl/nxviz). # In[56]: import nxviz as nv # In[57]: c=nv.CircosPlot(GD,node_labels=True) c.draw() # That was fast :-) Now let's dive into analysis of our dolphins social graph # In[58]: #.has_edge method allows us to check presence of direct interaction between two nodes GD.has_edge('40','37') # In[59]: print('shortest path from node 12 to 5 is of length', nx.shortest_path_length(GD,'12','5'),":", nx.shortest_path(GD,'12','5')) # For every vertex in graph we can calculate its degree – number of adjacent (connected by an edge )vertices. # In[60]: display(list(GD.neighbors('9'))) print('node 9 has %d neighbors' %GD.degree()['9']) # In[61]: GD.degree() #returns dictionary with all nodes # In[62]: # find 5 nodes with largest amount of neighbors sorted(dict(GD.degree()).items(), key=lambda x:x[1], reverse=True)[:5] # That brings us to several types of centrality - metric for describing importance of a node # ### Degree centrality # $$degree \ centrality = \frac {degree\ of\ node} {number\ of \ nodes\ in \ network}$$ # In[63]: #We create dictionary with nodes and their degree centralities degcent=nx.degree_centrality(GD) #for getting dictionary with nodes and #their degree centralities as keys and values respectively print("Networkx degree centrality for node 16:",degcent['16']) # Let's see 5 nodes with largest degree centrality: # In[64]: sorted(degcent.items(), key=lambda x: x[1], reverse=True)[:5] # In[65]: #we'll store node with the largest degree centraliy as 'nld ' nld=sorted(degcent.items(), key=lambda x: x[1], reverse=True)[:1][0][0] # Here goes sort of vizualization of this node: # In[66]: palette = sns.color_palette('coolwarm',2).as_hex() palette # In[67]: #this list wi'll be used in .draw call. node_colors=[palette[0] if n!=nld else palette[1] for n in GD.nodes() ] # In[68]: nx.draw(GD,node_color=node_colors,with_labels=True) # As mentioned above you can set attributes to nodes. Let's say we want each node to keep its degree as (one of) attribute: # In[69]: for n in GD.nodes(): GD.node[n]['degree centrality'] = degcent[n] # Now we can use it in customizing our vizualization: # In[70]: c=nv.CircosPlot(GD,node_order='degree centrality',node_labels=True) c.draw() # ### Closeness centrality # $$closeness \ centrality = \frac {N\ -\ 1} {sum\ of \ distances\ to \ each \ node \ from \ current \ one}$$ # # where $N =$ number of nodes # Compares how many steps (edges) it would take to reach every other node in a network (using shortest paths obviously). # Intuitively you can think of this as of 'average distance' to all other nodes. # # Degree centrality takes into account only direct connections thus deluding analyst sometimes as current node can be central only in local neighborhood. That's why closeness centrality is important. # # In case pf being not fully connected, this algorithm computes the closeness centrality for each connected component separately. # By the way, you can see connected parts of your graph this way (in our case graph is itself connected): # In[71]: for comp in nx.connected_components(GD): print(comp) # In[72]: #calculating closeness centrality for each node clocent = nx.closeness_centrality(GD) # In[73]: sorted(clocent.items(), key=lambda x: x[1], reverse=True)[:5] # In[74]: for n in GD.nodes(): GD.node[n]['closeness centrality'] = clocent[n] # In[75]: nlcc=sorted(clocent.items(), key=lambda x: x[1], reverse=True)[:1][0][0] palette = sns.color_palette('coolwarm',2).as_hex() node_colors=[palette[0] if n!=nlcc else palette[1] for n in GD.nodes() ] # In[76]: nx.draw(GD,node_color=node_colors,with_labels=True) # ### Betweeness Centrality # We can calculate shortest path for each pair of node (if they belong to one connected component obviously). # So betweeness centrality of a node indicates to how often this current node becomes a part of all shortest paths. # $$betweeness \ centrality = \frac {number \ of \ shortest \ paths \ passing \ throught \ current \ node } {totul \ number \ of \ shortest \ paths}$$ # Intuitively betweeness centrality can be thought of as measure of node influence in network. For example, members of a group with 'big' influence can be useful in terms of spreading a message/information. As an other example - in telecommunication network a node with largest betweeness centrality has biggest control due to amount information transferring through it. # In[77]: betcent = nx.betweenness_centrality(GD) for n in GD.nodes(): GD.node[n]['betweeness centrality'] = betcent[n] sorted(betcent.items(), key=lambda x: x[1], reverse=True)[:5] # In[78]: nlbc=sorted(clocent.items(), key=lambda x: x[1], reverse=True)[:1][0][0] palette = sns.color_palette('coolwarm',2).as_hex() node_colors=[palette[0] if n!=nlbc else palette[1] for n in GD.nodes() ] # In[79]: nx.draw(GD,node_color=node_colors,with_labels=True) # $Betweeness \ centrality \ for \ edges$ # # I guess, this term doesn't need an explanation as its meaning is almost the same as for node # In[80]: betcent_e=nx.edge_betweenness_centrality(GD) sorted(betcent_e.items(), key=lambda x: x[1], reverse=True)[:5] # In[81]: for e in GD.edges(): GD[e[0]][e[1]]['betweeness centrality']=betcent_e[e] # In[82]: #the bigger is betweeness centrality - the darker are edges c=nv.CircosPlot(GD,edge_color='betweeness centrality') c.draw() # In[83]: elbc=sorted(betcent_e.items(), key=lambda x: x[1], reverse=True)[:1][0][0] palette=['#bd3c14','#009688'] edge_colors=[palette[0] if e!=elbc else palette[1] for e in GD.edges() ] # In[84]: nx.draw(GD,edge_color=edge_colors,with_labels=True,node_color='#999999') # The green edge is the one with the largest betweeness centrality. Colors are quite distinguishable, I hope :-) # ### Eigenvector Centrality # OK, I tried to avoid this but here is real formula without words: # # $$Ax = \lambda x$$ # # A is adjacency matrix of graph, (one of the (if not the one) most-used representation of graph) - NxN matrix where values in intersections of columns and rows corresponds to connection between respective nodes. Non-zero values stand for connection between nodes (in case of unweighted graph 1 is the only option) and zero does mean nodes are not directly connected. # $\lambda$ is egenvalue of A and the largest eigenvalue associated with the eigenvector of the adjacency matrix A # # Importance of a node depends on the importance of its neighbors. # # In[85]: eigcent = nx.eigenvector_centrality_numpy(GD) sorted(eigcent.items(), key=lambda x: x[1], reverse=True)[:5] # In[86]: for n in GD.nodes(): GD.node[n]['eigenvector centrality'] = eigcent[n] # In[87]: #you can use this function for colouring nodes according to their values of given attribute def attribute_color(G,attribute): attrs=[G.node[n][attribute] for n in G.nodes()] uattrs=sorted(list(set(attrs))) palette = sns.color_palette('Blues',len(uattrs)).as_hex() colmap=dict(zip(uattrs,palette)) node_colors=[colmap[at] for at in attrs] nx.draw(G,node_color=node_colors,with_labels=True) # Let's see all four types of centrality in one picture. Darker shades of blue corresponds to bigger values of metric # In[88]: fig = plt.figure(figsize=(12, 12)) plt.subplot(221) attribute_color(GD,'degree centrality') plt.subplot(222) attribute_color(GD,'closeness centrality') plt.subplot(223) attribute_color(GD,'betweeness centrality') plt.subplot(224) attribute_color(GD,'eigenvector centrality') # ## Link Prediction # If we have infromation about social network at this given moment we might want to try to predict which new interaction are more likely to happen in future. # # ### Jaccard coefficient # # To find pair of nodes which are 'similar' to each other we can use the Jaccard coefficient # It measures what proportion of neighbors a pair of nodes share. # $$Jaccard _{uv} = \frac { |N_u \cap N_v |} {|N_u\ \cup \ N_v|}$$ # # where $N_u$ - set of neighbors of node $u$ # In[89]: jc = nx.jaccard_coefficient(GD) jcd = {} for u, v, p in jc: jcd[(u,v)] = p # In[90]: len(jcd) # How much pair of nodes have a probablity higher than 0.5 to connect in nearest future? # In[91]: len([(u,v) for u,v in jcd.keys() if jcd[(u,v)]>0.5]) # In[92]: #let's see those pairs [(u,v) for u,v in jcd.keys() if jcd[(u,v)]>0.5] # In[93]: #let's vizualize nodes that are most likely to interact the same way as with nodes before ejc=sorted(jcd.items(), key=lambda x: x[1], reverse=True)[:1][0][0] palette = sns.color_palette('coolwarm',2).as_hex() node_colors=[palette[0] if n not in ejc else palette[1] for n in GD.nodes() ] # In[94]: nx.draw(GD,node_color=node_colors,with_labels=True) # ### Preferential Attachment # # In the other approach to link prediction nodes with high degree will be the ones to be more likely to get future connections. # $$# PA _{uv} = { |N_u \cap N_v |} \ or \ {deg_u * deg_v} #$$ # In[95]: pa = nx.preferential_attachment(GD) pad = {} for u, v, p in pa: pad[(u,v)] = p # In[96]: sorted(pad.items(), key=lambda x:x[1], reverse=True)[:5] # You can scale it though :-) # In[97]: epa=sorted(pad.items(), key=lambda x: x[1], reverse=True)[:1][0][0] palette = sns.color_palette('coolwarm',2).as_hex() node_colors=[palette[0] if n not in epa else palette[1] for n in GD.nodes() ] # In[98]: nx.draw(GD,node_color=node_colors,with_labels=True) # ### Resource Allocation # # $$RA _{uv} = \sum_{w\in N_u \cap N_v} { \frac { 1} {deg_w } }$$ # # Inspiration for proposing this similarity measure is following: we consider two nodes $u$ and $v$, where $u$ can allocate resources to $v$ (as well as the other way around), through their common neighbors (transmitters). We assume that each node has one resource only which it assigns to its neighbors evenly. Thus the expression above is used for calculating amount of resources node can get from the other one. # There is one very similar method proposed for measuring probability of interaction - *Adamic/Adar (AA)*. The difference is that the latter one has logarithm in denominator. So RA penalizes high-degree neigbours more. # # [Here](https://networkx.github.io/documentation/stable/reference/algorithms/link_prediction.html) you can find all implemented similarity indexes in NetworkX. # In[99]: ra = nx.resource_allocation_index(GD) rad = {} for u, v, p in ra: rad[(u,v)] = p # In[100]: sorted(rad.items(), key=lambda x:x[1], reverse=True)[:5] # In[101]: era=sorted(rad.items(), key=lambda x: x[1], reverse=True)[:1][0][0] palette = sns.color_palette('coolwarm',2).as_hex() node_colors=[palette[0] if n not in era else palette[1] for n in GD.nodes() ] # In[102]: nx.draw(GD,node_color=node_colors,with_labels=True,node_size=200) # ## Structure analysis # # ### Cliques # # Cliques in network analysis - completely connected graphs (or subset of nodes). Maximal cliques is such cliques that stop being cliques with addition of any single edge. # In[103]: len(list(nx.find_cliques(GD))) # In[104]: list(nx.find_cliques(GD))[:5] # Yes, edge is also under a definition of clique. # And just so you knew this algorithm is not suitable for directed graph (as [documentation](https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.algorithms.clique.find_cliques.html) states) # In[105]: from networkx.algorithms.community import k_clique_communities # *k_clique_communities* return a list of groups, each produced by combining all k-size cliques sharing $k-1$ nodes. This represents the concept of *clique percolation method* that allows for discovering overlapping structure of network by detecting fully-connected subgraphs of $k$ nodes. Two $k$ - cliques are considered adjacent if they share $k$ neighbours. # In[106]: list(k_clique_communities(GD, 3, cliques=None)) # ### Community Detection # # Different groups of nodes seem to be more densely connected internally rather than externally, which makes them called “communities”. In the past decade the problem of graph partitioning into number of groups attracted a lot of interest from physics and statisticians. This field of study, commonly called ‘community detection’, now includes growing number of papers suggesting new algorithms and methods and modifications of existing ones. The importance of community detection is quite obvious as it allows for analyzing the structure of networks, to make visualization more clear, to make certain conclusions or predictions based on analysis. To note, before detection a number of communities is uncertain, which makes the difference between community detection and clustering, where an amount of clusters is given intentionally. Moreover, communities can differ by size and density, can incorporate hierarchical structure. # So, networkX package for community detection has an algorithm based on [this paper](https://arxiv.org/pdf/cond-mat/0112110.pdf). # In[107]: from networkx.algorithms.community import girvan_newman # The idea of Girvan Newmann algorithm is that at each step edges with highest betweeness are removed thus separating graph into densely connected parts. # In[108]: gn = girvan_newman(GD) #returns an iterator over tuples of communities. # So each tuple is a partition of a graph at current level of algorithm. At the last iteration all nodes themselves are communities: # In[109]: len(list(gn)[-1]) #as you might remember there are 62 nodes in our network # Let's see a state of partition at the second iteration: # In[110]: gn = girvan_newman(GD) next(gn) second_iteration= tuple(sorted(c) for c in next(gn)) # In[111]: print(second_iteration) # In[112]: from seaborn import color_palette # With this function you can make coloured vizualization of communities: # In[113]: def colour_communities(G,partition): commap={} for n in G.nodes(): for i, c in enumerate(partition): if n in c: commap[n] = i G.node[n]['community']=i cs=[G.node[n]['community'] for n in G.nodes()] ucs=list(set(cs)) palette = color_palette('coolwarm',len(ucs)).as_hex() colmap=dict(zip(ucs,palette)) node_colors=[colmap[c] for c in cs] return node_colors, colmap, palette # In[114]: #store partitions at first four iterations gn = girvan_newman(GD) first_iteration= tuple(sorted(c) for c in next(gn)) second_iteration=tuple(sorted(c) for c in next(gn)) third_iteration=tuple(sorted(c) for c in next(gn)) fourth_iteration=tuple(sorted(c) for c in next(gn)) # In[115]: [len(part) for part in list(first_iteration)] # First iteration - two communities consisting of 41 and 21 nodes. # In[116]: display([len(part) for part in list(second_iteration)]) display([len(part) for part in list(third_iteration)]) display([len(part) for part in list(fourth_iteration)]) # Now pictures! # In[117]: fig = plt.figure(figsize=(20, 15)) plt.subplot(221) node_colors, color_map, palette = colour_communities(GD, first_iteration) nx.draw(GD, node_color=node_colors, with_labels=True) plt.subplot(222) node_colors, color_map, palette = colour_communities(GD, second_iteration) nx.draw(GD, node_color=node_colors, with_labels=True) plt.subplot(223) node_colors, color_map, palette = colour_communities(GD, third_iteration) nx.draw(GD, node_color=node_colors, with_labels=True) plt.subplot(224) node_colors, color_map, palette = colour_communities(GD, fourth_iteration) nx.draw(GD, node_color=node_colors, with_labels=True) # ### Subgraph # # When you have a large graph but want to vizualize or analyze only a small portion of it it can be helpful to extract nodes of interest and corresponding edges. Thats what subgraph is about. # # For example, let's construct a subgraph based on node with the largest degree and its neigbours # In[118]: sorted(dict(GD.degree()).items(), key=lambda x:x[1], reverse=True)[:1] # In[119]: nbrs=list(GD.neighbors('15')) nbrs # In[120]: #don't forget to include your node of interest nbrs.append('15') # In[121]: GD_sub15=GD.subgraph(nbrs) print(nx.info(GD_sub15)) # In[122]: nx.draw(GD_sub15,with_labels=True,alpha=0.7,node_size=500) # ## Bipartite graphs # Bipartite graphs - graphs such that: # - partitioned into two sets (each node belong to one of the two groups) # - node can be connected only to node from other set # # # As an example of bipartite graph you can imagine a graph consisting of two sets: customers and products they purchase, website users and movies they leave reviews to. # In networkX bipartite concept can be implemented by keyword *bipartite* when specifying metadata for nodes (or by any attribute name you want). # Now I took [graph](https://www.dropbox.com/s/mnxzbndbxq1dif5/out.brunson_club-membership_club-membership?dl=0) from the [same source](http://konect.uni-koblenz.de/networks/). It's also real but still toy (in a way) example. Network consists of 40 nodes - 25 corporate executive officers and 15 social organizations they are members of. And each edge (in the amount of 95) tells that current person has a membership status in corresponding club. # In[123]: edges = [] with open('datasets/out.brunson_club-membership_club-membership', 'r') as f: filereader = csv.reader(f, delimiter=" ", quotechar='"') next(filereader) # skips header row next(filereader) for row in filereader: edges.append(row[:2]) # In[124]: # in original format each set of nodes begins from 1.. so for the sake of distinguishability this is what I came to for edge in edges: edge[0]='m'+edge[0] edge[1]='c'+edge[1] # In[125]: edges[:5] # In[126]: CM=nx.Graph() # In[127]: clubs=['c'+str(i+1) for i in range(15)] members=['m'+str(i+1) for i in range(25)] # In[128]: CM.add_nodes_from(clubs,bipartite='clubs') CM.add_nodes_from(members,bipartite='members') CM.add_edges_from(edges) # In[129]: top = nx.bipartite.sets(CM)[0] pos = nx.bipartite_layout(CM, top,scale=7) # In[130]: print(nx.info(CM)) # Bipartite graphs can be visualized the following way: # In[131]: fig = plt.figure(figsize=(7, 7)) nx.draw(CM,pos,node_color='#9699bc',alpha=0.7,with_labels=True,aspect_ratio=0.1,node_size=450) # Now it'a time to show application (kind of) of biparite graph - recommendations. As well as graph our task will be relatively toy. Let's try to recommend to person a organization to join based on similar persons. (similarity between persons is measured based on organization they have membership of. # In[132]: #function that produces the set of clubs both persons are going to def shared_partition(G,mbr1,mbr2): nbrs1 = G.neighbors(mbr1) nbrs2 = G.neighbors(mbr2) overlap = set(nbrs1).intersection(nbrs2) return overlap def member_similarity(G, mbr1, mbr2): shared_nodes = shared_partition(G,mbr1,mbr2) return len(shared_nodes) / len(clubs) # In[133]: shared_partition(CM,'m3','m5') # In[134]: member_similarity(CM,'m23','m13') # In[135]: def most_similar_members(G, member,thr=0.14): # you can specify the threshold mbrs=members.copy() mbrs.remove(member) similarities = defaultdict(list) for m in mbrs: similarity = member_similarity(G, member, m) similarities[similarity].append(m) max_similarity = max(similarities.keys()) if max_similarity>=thr: return [similarities[max_similarity]] else: return [[]] # In[136]: most_similar_members(CM, 'm15') # In[137]: def recommend_club(G,member,thr=0.14): smlr=most_similar_members(G, member,thr) tmp=[list(G.neighbors(n)) for n in smlr[0]] # we make a list of neighbours lists for every node in list of similar members smlr_clubs = set([i for sub in tmp for i in sub]) # ..and then flatten it mbr_clubs = set(G.neighbors(member)) return list(smlr_clubs.difference(mbr_clubs)) print(recommend_club(CM,'m15')) # In[138]: #store recommendations for every member recommendations={} for mbr in members: recommendations[mbr]=[recommend_club(CM,mbr)] # In[139]: recommendations # As I said, it is a very toy example, that's why default threshold (0.14) is so small. But I guess, you've grasphed an idea of what can be done. Also there are several papers dedicated to link prediction in bipartite graphs but that's already beyond the scope of this tutorial # That's all :-) I hope you've learned something new from this. # NetworkX is a set of various tools for study of the network structure and dynamics which you can use for your needs to find useful insight in your data. Network analysis,in turn, is quite an useful approach to analyze systems and can be implemented in biology, social science, logistics. Moreover, community detection has been there for less than 20 years and new algorithms/methods are proposed nowadays. # Here you can find something useful: # # - [course in DataCamp](https://www.datacamp.com/courses/network-analysis-in-python-part-1) # - [video of presentation at PyData Carolinas 2016](https://www.youtube.com/watch?v=7fsreJMy_pI) # - [NetworkX documentation](https://networkx.github.io/documentation/stable/reference/index.html) # - [the very first paper dedicated to *community detection* I believe](https://arxiv.org/pdf/cond-mat/0112110.pdf) # - [another one about community detection but monography](https://arxiv.org/pdf/0906.0612.pdf) # - [collection of networks, once again](http://konect.uni-koblenz.de/networks/)