#!/usr/bin/env python
# coding: utf-8
#
Social Network Analysis
#
#
# The goal of this byte is to explore some algorithms and visualizations relating to networks
# K-Core Algorithm Introduction:
# ============
# * K-Core is an approach of simplifying a graph by removing the edges that have small degrees. The goal of the algorithm is to find groups of nodes that are all connected to at least k other people in the same group
# * For more information, you can read this paper http://arxiv.org/pdf/cs/0504107v2.pdf
#
# Algorithm:
# -------------
# 1. Delete all the nodes and corresppoding edges that have degrees less than k
# 2. Calculate the degrees of all the remaining nodes.
# 3. If the degrees of all the nodes are larger than k or equal to k, return; Otherwise, repeat from step 1
# In[1]:
import copy
# open the file you have downloaded
# these files are organized
file = open("amazon.txt")
# this returns an array with one entry for each line ni the file
lines = file.readlines()
print len(lines)
# Note: the format of the snap files is to list a node (identified by a unique number)
# and all of the nodes it links to (also identified by numbers), on the same line, separated by tabs.
# In[2]:
# construct the graph
# a set is an unordered collection of unique elements
edges = set()
# this will store our nodes
nodes = {}
# divide the line into the node and all of its edges
# for each line in the file that was loaded in
for line in lines:
# divide the line into the node and all of its edges
data = line.split()
a = int(data[0])
b = int(data[1])
# add the edge
edges.add((a, b))
# update the count for the number of times we've seen each node
nodes[a] = nodes.get(a, -1) + 1
nodes[b] = nodes.get(b, -1) + 1
print "number of unique edges"
print len(edges)
print "number of unique nodes"
print len(nodes)
# In[3]:
# get the degrees of each node in a set of edges
def get_degrees(edges):
degree_counts={}
# for each pair of nodes (edge)
for i,j in edges:
# increment the count for the number of edges connected
# to each node by one
degree_counts[i] = degree_counts.get(i, 0) + 1
degree_counts[j] = degree_counts.get(j, 0) + 1
return degree_counts
# Delete all nodes in delete_nodes from edges
def delete_node(edges, delete_nodes):
# construct a new set of edges
new_edges = []
print "# of nodes to be deleted", len(delete_nodes)
# loop through all the current edges
for i, j in edges:
# if an edges two nodes are not in the
# set of nodes to be deleted
if i not in delete_nodes and j not in delete_nodes:
# append that edge to our new edges
new_edges.append((i,j))
return new_edges
# kcore algorithm
# We run the kcore algorithm to delete all
# the nodes whose cores are less than k
# returns a new set of edges and nodes
# including only those in the k core.
def kcore(edges, k):
# make a complete copy of the edges so we can delete or change
# things without messing up our original
edges = copy.deepcopy(edges)
# now for each pair of nodes, count the number of
degree_counts = get_degrees(edges)
# sort the nodes by degree and return
# only the node numbers (not their degree)
sorted_nodes = sorted(degree_counts, key = degree_counts.get)
print "largest degree: ", degree_counts[sorted_nodes[0]]
# repeatedly delete all nodes with degrees < k to find the k core
# if we run out of nodes, or the largest count is < k we should stop
while ((len(sorted_nodes) > 0) and (degree_counts[sorted_nodes[0]]