%matplotlib inline import json import pandas as pd import numpy as np import networkx as nx import requests from pattern import web import matplotlib.pyplot as plt from operator import itemgetter # set some nicer defaults for matplotlib from matplotlib import rcParams #these colors come from colorbrewer2.org. Each is an RGB triplet dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = False rcParams['axes.facecolor'] = 'lightgray' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecessary plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() """ Function -------- get_senate_vote Scrapes a single JSON page for a particular Senate vote, given by the vote number Parameters ---------- vote : int The vote number to fetch Returns ------- vote : dict The JSON-decoded dictionary for that vote Examples -------- 
>>> get_senate_vote(11)['bill'] {u'congress': 113, u'number': 325, u'title': u'A bill to ensure the complete and timely payment of the obligations of the United States Government until May 19, 2013, and for other purposes.', u'type': u'hr'} """ #your code here def get_senate_vote(vote): url = ''.join(["https://www.govtrack.us/data/congress/113/votes/2013/s",vote,"/data.json"]) data = requests.get(url).json() return data """ Function -------- get_all_votes Scrapes all the Senate votes from http://www.govtrack.us/data/congress/113/votes/2013, and returns a list of dicts Parameters ----------- None Returns -------- votes : list of dicts List of JSON-parsed dicts for each senate vote """ #Your code here ## Q: Why use BeautifulSoup? ## A: pattern.web's documentation is awful. ## Even though it says its implementation is on top of BS, it doesn't behave perfectly. ## BS's 'findAll' method is great here, so i switched. from BeautifulSoup import BeautifulSoup import re def get_all_votes(): link = "https://www.govtrack.us/data/congress/113/votes/2013/" data = requests.get(link).text soup = BeautifulSoup(data) ## Q: what does the regex mean? ## A: 's' and '/' are string literals ## \d is shorthand for [0-9] ## {1,4} means we need at least one digit, and up to 4 ## (i'm pretty sure there aren't more than 999 votes, but definitely not more than 9999 votes!) pattern = re.compile("s\d{1,4}/") ## search for href tags with the above pattern senate_votes = soup.findAll(href=pattern) ## Q: why [1:-1] in vote['href']? ## A: first character (index 0) is 's', last character (index -1) is '/' ## so [1:-1] to extract just the number, which is what we need to pass to get_senate_vote() all_senate_votes = [] all_folders = [] votes = [ int(vote['href'][1:-1]) for vote in senate_votes ] votes = np.sort(votes) ## for the first day i worked on this function, i had a hell of a time getting all the way through ## without the site crashing on me. 
since then, it hasn't been a problem, although my code didn't change at all. ## anyway, the missed[] array here is for catching errors, just in case the problem happens again. missed = [] for vote in votes: try: all_senate_votes.append(get_senate_vote(str(vote))) except: missed.append(vote) return all_senate_votes vote_data = get_all_votes() def get_senator_display_names(data): info = set() for vote in data: for ix in xrange(4): for vote_type in vote['votes']: if (vote_type == 'Yea') or (vote_type == 'Nay'): for datum in vote['votes'][vote_type]: senator_info = datum['display_name'] info.add(senator_info) ## get unique list of senator names return info ## test ## info = get_senator_display_names(vote_data) ## print list(info)[:2] def fill_sen_df(info): ## set up df for all senator info (including vote agreement counts) ## it's a 104 row X 108 column matrix ## (the first 4 columns are string data about each senator) cols = ['display','surname','party','color','state'] cols.extend(list(info)) sen_df = pd.DataFrame(index=info, columns = cols, data = np.zeros((len(info),len(info)+5))) ## change the first few columns to string dtype sen_df[['display','surname', 'party', 'color', 'state']] = sen_df[['display','surname', 'party', 'color', 'state']].astype(str) for senator in info: sen_df.loc[senator, 'display'] = senator surname = senator[:-7] sen_df.loc[senator,'surname'] = surname affiliation = re.search("\(([RDI]{1})", senator).groups()[0] sen_df.loc[senator,'party'] = affiliation home_state = senator[-3:-1] sen_df.loc[senator,'state'] = home_state col = 'r' if affiliation == 'R' else ('b' if affiliation == 'D' else 'k') sen_df.loc[senator,'color'] = col return sen_df def fill_vote_agreement(sen_df, data): ct = 0 sen_names = [] for vote in data: ct += 1 #print 'this is vote #',ct,'out of',len(data),'total votes.' 
already_counted = [] for vote_type in vote['votes']: if (vote_type == 'Yea') or (vote_type == 'Nay'): for nodeA in vote['votes'][vote_type]: nameA = nodeA['display_name'] if (not nameA in sen_names): sen_names.append(nameA) for nodeB in vote['votes'][vote_type]: nameB = nodeB['display_name'] this_pair = np.sort([nameA, nameB]) if (not (this_pair[0], this_pair[1]) in already_counted) and (nameA != nameB): already_counted.append((this_pair[0], this_pair[1])) #if (sen_df.loc[ this_pair[0], this_pair[1] ] > 1): #print 'value:', sen_df.loc[ this_pair[0], this_pair[1] ] sen_df.loc[this_pair[0],this_pair[1]] = sen_df.loc[this_pair[0],this_pair[1]] + 1 #print this_pair[0], 'and', this_pair[1], #' count: ', sen_df.loc[ this_pair[0], this_pair[1] ] return sen_df """ Function -------- vote_graph Parameters ---------- data : list of dicts The vote database returned from get_vote_data Returns ------- graph : NetworkX Graph object, with the following properties 1. Each node in the graph is labeled using the `display_name` of a Senator (e.g., 'Lee (R-UT)') 2. Each node has a `color` attribute set to 'r' for Republicans, 'b' for Democrats, and 'k' for Independent/other parties. 3. The edges between two nodes are weighted by the number of times two senators have cast the same Yea or Nay vote 4. Each edge also has a `difference` attribute, which is set to `1 / weight`. Examples -------- >>> graph = vote_graph(vote_data) >>> graph.node['Lee (R-UT)'] {'color': 'r'} # attributes for this senator >>> len(graph['Lee (R-UT)']) # connections to other senators 101 >>> graph['Lee (R-UT)']['Baldwin (D-WI)'] # edge relationship between Lee and Baldwin {'difference': 0.02, 'weight': 50} """ #Your code here def vote_graph(data): ## in general i know it's not good to create globals like this ## but seeing as we have required inputs and outputs for this function, this is my workaround to having access ## to this df later on, when i create my graphs. 
global sen_df ## ## three helper functions ## ## get_senator_display_names retrieves a list of all the senators in "[surname] ([party]-[state])" format info = get_senator_display_names(data) ## i want one big pandas df to hold all my data ## fill_sen_df() just gets all the basic information for a given senator sen_df = fill_sen_df(info) ## fill_vote_agreement goes in and loops through each vote to collect edge weights ## then it dumps it in the 104x104 matrix which constitutes the tail end of my df df = fill_vote_agreement(sen_df, data) ## for use later on in graph creation sen_df = df ## initialize Graph object g = nx.Graph() ## add graph nodes (as senator surnames) for idx, name in enumerate(df['display']): g.add_node(name, surname=df.ix[idx,'surname'], color=df.ix[idx, 'color']) ## add edges (along with weight and difference parameters) ## we do this with a nested for loop - basically running through all senator names twice, concurrently ## for each unique pair with some voting relationship, add an edge to the network for nameA in df['display']: for nameB in df['display']: this_pair = np.sort([nameA,nameB]) if (nameA != nameB) and (df.loc[this_pair[0]][this_pair[1]] > 0): ## get edge weight from sen_df weight = sen_df.loc[this_pair[0]][this_pair[1]] ## add edge between co-voting senators ## include 'weight' and 'difference' attributes g.add_edge(this_pair[0], this_pair[1], weight=weight, difference=(1./weight)) return g votes = vote_graph(vote_data) ## test print votes.node['Lee (R-UT)'] print len(votes['Lee (R-UT)']) print votes['Lee (R-UT)']['Baldwin (D-WI)'] #this makes sure draw_spring results are the same at each call np.random.seed(1) color = [votes.node[senator]['color'] for senator in votes.nodes()] #determine position of each node using a spring layout pos = nx.spring_layout(votes, iterations=200, k=2) #plot the edges nx.draw_networkx_edges(votes, pos, alpha = .05) #plot the nodes nx.draw_networkx_nodes(votes, pos, node_color=color) #draw the labels 
lbls = nx.draw_networkx_labels(votes, pos, alpha=5, font_size=8)

#coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)

#Your code here
## minimum spanning tree over 'difference' (= 1/weight): keeps the
## strongest-agreement edge structure of the vote graph
spanning_tree = nx.minimum_spanning_tree(votes, weight='difference')

## recompute color assignments
color = [spanning_tree.node[senator]['color'] for senator in spanning_tree.nodes()]

##
## repeat graph display code from above
## the only difference is k parameter for spring_layout
##
#determine position of each node using a spring layout
pos = nx.spring_layout(spanning_tree, iterations=200, k=.03)

#plot the edges
nx.draw_networkx_edges(spanning_tree, pos, alpha = .02)

#plot the nodes
nx.draw_networkx_nodes(spanning_tree, pos, node_color=color)

#draw the labels
lbls = nx.draw_networkx_labels(spanning_tree, pos, alpha=5, font_size=8)

#coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)

#Your code here
## compute closeness scores
## add to sen_df
closeness = nx.closeness_centrality(votes, distance='difference')

## fix: was np.zeros((104,1)) — a hard-coded senator count AND a 2-D array
## assigned as a column; size the 1-D zero column from the frame itself
sen_df['closeness'] = np.zeros(len(sen_df))
for sen in closeness.keys():
    sen_df.ix[sen,'closeness'] = closeness[sen]

## sort senators from least to most central for the bar chart
close_df = sen_df.sort(columns='closeness')

N = close_df.shape[0]
ind = np.arange(N)  ## the x locations for the groups
width = .9          ## the width of the bars

fig, ax = plt.subplots()
ax.bar(ind, close_df['closeness'], width, color=list(close_df['color'].values))

## with help from @1503
xlocs, senators = zip(*enumerate(close_df['surname']))
xticks_locs, xticks_labels = plt.xticks(xlocs, senators)
plt.xlim(min(xlocs), max(xlocs))

## add some labels
ax.set_ylabel('Closeness score')
ax.set_xlabel('Senator Surnames')
ax.set_title('Closeness Centrality of US Senators in the 113th Congress')
ax.set_xticklabels( close_df['surname'], rotation='vertical', fontsize=8)

## it's hard to get everyone on in one graph - so we make it wider
fig.set_size_inches(19,5.5)
plt.show()
#your code here def across_aisle_graph(df): g_aisle = nx.Graph() for idx, name in enumerate(df['display']): g_aisle.add_node(name, surname=df.ix[idx,'surname'], color=df.ix[idx, 'color']) for nameA in df['display']: for nameB in df['display']: this_pair = np.sort([nameA,nameB]) if (nameA != nameB) and (df.loc[this_pair[0]][this_pair[1]] > 0) and (df.loc[nameA, 'party'] != df.loc[nameB, 'party']): ## get edge weight from sen_df weight = sen_df.loc[this_pair[0]][this_pair[1]] ## add edge between co-voting senators ## include 'weight' and 'difference' attributes g_aisle.add_edge(this_pair[0], this_pair[1], weight=weight, difference=(1./weight)) return g_aisle cross_aisle = across_aisle_graph(sen_df) np.random.seed(1) color = [cross_aisle.node[senator]['color'] for senator in cross_aisle.nodes()] #determine position of each node using a spring layout pos = nx.spring_layout(cross_aisle, iterations=200, k=2) #plot the edges nx.draw_networkx_edges(cross_aisle, pos, alpha = .05) #plot the nodes nx.draw_networkx_nodes(cross_aisle, pos, node_color=color) #draw the labels lbls = nx.draw_networkx_labels(cross_aisle, pos, alpha=5, font_size=8) #coordinate information is meaningless here, so let's remove it plt.xticks([]) plt.yticks([]) remove_border(left=False, bottom=False) #Your code here aisle_spanning_tree = nx.minimum_spanning_tree(cross_aisle, weight='difference') ## recompute color assignments color = [aisle_spanning_tree.node[senator]['color'] for senator in aisle_spanning_tree.nodes()] ## ## repeat graph display code from above ## the only difference is k parameter for spring_layout ## #determine position of each node using a spring layout pos = nx.spring_layout(aisle_spanning_tree, iterations=100, k=.4) #plot the edges nx.draw_networkx_edges(aisle_spanning_tree, pos, alpha = .02) #plot the nodes nx.draw_networkx_nodes(aisle_spanning_tree, pos, node_color=color) #draw the labels lbls = nx.draw_networkx_labels(aisle_spanning_tree, pos, alpha=5, font_size=8) #coordinate 
information is meaningless here, so let's remove it plt.xticks([]) plt.yticks([]) remove_border(left=False, bottom=False) path = 'images/before_minimizing.png' Image(path) path = 'images/after_minimizing.png' Image(path) """ Function -------- get_senate_bill Scrape the bill data from a single JSON page, given the bill number Parameters ----------- bill : int Bill number to fetch Returns ------- A dict, parsed from the JSON Examples -------- >>> bill = get_senate_bill(10) >>> bill['sponsor'] {u'district': None, u'name': u'Reid, Harry', u'state': u'NV', u'thomas_id': u'00952', u'title': u'Sen', u'type': u'person'} >>> bill['short_title'] u'Agriculture Reform, Food, and Jobs Act of 2013' """ #your code here def get_senate_bill(bill): url = ''.join(["https://www.govtrack.us/data/congress/113/bills/s/s",str(bill),"/data.json"]) #print 'Bill:',bill data = requests.get(url).json() return data ## test ## bill = get_senate_bill(10) ## print bill['sponsor'] ## print bill['short_title'] """ Function -------- get_all_bills Scrape all Senate bills at http://www.govtrack.us/data/congress/113/bills/s Parameters ---------- None Returns ------- A list of dicts, one for each bill """ #your code here from BeautifulSoup import BeautifulSoup import re def get_all_bills(): link = "https://www.govtrack.us/data/congress/113/bills/s/" data = requests.get(link).text soup = BeautifulSoup(data) ## Q: what does the regex mean? ## A: 's' and '/' are string literals ## \d is shorthand for [0-9] ## {1,4} means we need at least one digit, and up to 4 (we don't have more than 9,999 bills) pattern = re.compile("s\d{1,4}/") ## search for href tags with the above pattern senate_bills = soup.findAll(href=pattern) ## Q: why [1:-1] in vote['href']? 
## A: first character (index 0) is 's', last character (index -1) is '/' ## so [1:-1] to extract just the number, which is what we need to pass to get_senate_bill() all_senate_bills = [] all_folders = [] bills = [ int(bill['href'][1:-1]) for bill in senate_bills ] bills = np.sort(bills) #print votes missed = [] for bill in bills: #print 'Getting votes for', vote try: all_senate_bills.append(get_senate_bill(str(bill))) except: #print 'failed on: ',vote missed.append(bill) return all_senate_bills bill_list = get_all_bills() """ Function -------- bill_graph Turn the bill graph data into a NetworkX Digraph Parameters ---------- data : list of dicts The data returned from get_all_bills Returns ------- graph : A NetworkX DiGraph, with the following properties * Each node is a senator. For a label, use the 'name' field from the 'sponsor' and 'cosponsors' dict items * Each edge from A to B is assigned a weight equal to how many bills are sponsored by B and co-sponsored by A """ #Your code here def bill_graph(data): g2 = nx.DiGraph() senators = set() for x in data: senators.add(x['sponsor']['name']) for cosponsor in x['cosponsors']: senators.add(cosponsor['name']) for name in senators: for x in data: sponsor = x['sponsor']['name'] for cosponsor in x['cosponsors']: if name == cosponsor['name']: if g2.has_edge(name,sponsor): g2.edge[name][sponsor]['weight'] += 1 else: if name != sponsor: g2.add_edge(name,sponsor, weight=1) #cosponsor_df.loc[name, sponsor] += 1 return g2 bills = bill_graph(bill_list) #Your code here pgrank = nx.pagerank_numpy(g2, weight='weight') ## sort pageranks from high to low scores ## with guidance from: http://stackoverflow.com/questions/613183/python-sort-a-dictionary-by-value sorted_pgrank = sorted(pgrank.iteritems(), key=itemgetter(1), reverse=True) ## print 5 highest pageranks print '\n\nThe 5 senators with highest PageRank scores:\n' for senator in sorted_pgrank[:5]: print 'Name:',senator[0], ' PageRank:', round(senator[1],4) print "\n\n" names, 
ranks = zip(*sorted_pgrank) surnames = [name.split(',')[0] for name in list(names)] N = len(names) ind = np.arange(N) ## the x locations for the groups width = .9 ## the width of the bars fig, ax = plt.subplots() ax.bar(ind, ranks, width) ## with help from @1503 xlocs, senators = zip(*enumerate(names)) xticks_locs, xticks_labels = plt.xticks(xlocs, senators) plt.xlim(min(xlocs), max(xlocs)) ## add some labels ax.set_ylabel('PageRank score') ax.set_xlabel('Senators') ax.set_title('Bill Authorship "PageRank" of US Senators in the 113th Congress') ax.set_xticklabels( surnames, rotation='vertical', fontsize=8) ## it's hard to get everyone on in one graph - so we make it wider fig.set_size_inches(19,5.5) plt.show() ## get list of node degrees in order of sorted names from pagerank ordering degrees = [g2.degree(name) for name in list(names)] ## make the same plot as above, but add a second set of bars for degree N = len(names) ind = np.arange(N) ## the x locations for the groups width = .9 ## the width of the bars fig = plt.figure() ax = fig.add_subplot(111) ax.bar(ind, ranks, width) ## here we add the extra set of bars for node degree ax2 = ax.twinx() ax2.bar(ind+width+0.35, degrees, 0.45, color='#deb0b0') xlocs, senators = zip(*enumerate(names)) xticks_locs, xticks_labels = plt.xticks(xlocs, senators) plt.xlim(min(xlocs), max(xlocs)) ax.set_xlabel('Senators') ax.set_title('Bill Authorship "PageRank" (blue) vs Node Degrees (pink) in 113th Senate') ax.set_xticklabels( surnames, rotation='vertical', fontsize=8 ) fig.set_size_inches(19,5.5) plt.show() path = 'images/reid_menendez.png' Image(path) path = 'images/ayotte.png' Image(path) ## Originally had trouble converting graph object, as attributes were all numpy objects ## But write_gexf() wanted base objects (float, string, etc) ## ## The discussion on @1570 led me to this page: ## https://groups.google.com/forum/#!msg/networkx-discuss/0K61tql2Hv4/cJtR2AV3jJYJ ## and the function I use below is taken directly from that 
thread. Very helpful! def fix_float_for_gexf(network): for u,v,d in network.edges_iter(data=True): for k,v in d.items(): if isinstance(d[k], np.float64): d[k] = float( d[k] ) #network.edge[u][v] = d for u,d in network.nodes_iter(data=True): for k,v in d.items(): if isinstance(d[k], np.float64): d[k] = float( d[k] ) #network.node[u] = d return network votes3 = fix_float_for_gexf(votes2) nx.write_gexf(votes3, 'votes.gexf') from IPython.display import Image path = 'images/senate.png' Image(path) path = 'images/linkedin_zoom_full.png' Image(path)