%matplotlib inline import json import pandas as pd import numpy as np import networkx as nx import requests from pattern import web import matplotlib.pyplot as plt from operator import itemgetter # set some nicer defaults for matplotlib from matplotlib import rcParams #these colors come from colorbrewer2.org. Each is an RGB triplet dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = False rcParams['axes.facecolor'] = 'lightgray' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecessary plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() """ Function -------- get_senate_vote Scrapes a single JSON page for a particular Senate vote, given by the vote number Parameters ---------- vote : int The vote number to fetch Returns ------- vote : dict The JSON-decoded dictionary for that vote Examples -------- 
>>> get_senate_vote(11)['bill'] {u'congress': 113, u'number': 325, u'title': u'A bill to ensure the complete and timely payment of the obligations of the United States Government until May 19, 2013, and for other purposes.', u'type': u'hr'} """ #your code here def get_senate_vote(vote): url = ''.join(["https://www.govtrack.us/data/congress/113/votes/2013/s",vote,"/data.json"]) data = requests.get(url).json() return data """ Function -------- get_all_votes Scrapes all the Senate votes from http://www.govtrack.us/data/congress/113/votes/2013, and returns a list of dicts Parameters ----------- None Returns -------- votes : list of dicts List of JSON-parsed dicts for each senate vote """ #Your code here ## Q: Why use BeautifulSoup? ## A: pattern.web's documentation is awful. ## Even though it says its implementation is on top of BS, it doesn't behave perfectly. ## BS's 'findAll' method is great here, so i switched. from BeautifulSoup import BeautifulSoup import re def get_all_votes(): link = "https://www.govtrack.us/data/congress/113/votes/2013/" data = requests.get(link).text soup = BeautifulSoup(data) ## Q: what does the regex mean? ## A: 's' and '/' are string literals ## \d is shorthand for [0-9] ## {1,4} means we need at least one digit, and up to 4 ## (i'm pretty sure there aren't more than 999 votes, but definitely not more than 9999 votes!) pattern = re.compile("s\d{1,4}/") ## search for href tags with the above pattern senate_votes = soup.findAll(href=pattern) ## Q: why [1:-1] in vote['href']? ## A: first character (index 0) is 's', last character (index -1) is '/' ## so [1:-1] to extract just the number, which is what we need to pass to get_senate_vote() all_senate_votes = [] all_folders = [] votes = [ int(vote['href'][1:-1]) for vote in senate_votes ] votes = np.sort(votes) ## for the first day i worked on this function, i had a hell of a time getting all the way through ## without the site crashing on me. 
since then, it hasn't been a problem, although my code didn't change at all. ## anyway, the missed[] array here is for catching errors, just in case the problem happens again. missed = [] for vote in votes: try: all_senate_votes.append(get_senate_vote(str(vote))) except: missed.append(vote) return all_senate_votes vote_data = get_all_votes() def get_senator_display_names(data): info = set() for vote in data: for ix in xrange(4): for vote_type in vote['votes']: if (vote_type == 'Yea') or (vote_type == 'Nay'): for datum in vote['votes'][vote_type]: senator_info = datum['display_name'] info.add(senator_info) ## get unique list of senator names return info ## test ## info = get_senator_display_names(vote_data) ## print list(info)[:2] def fill_sen_df(info): ## set up df for all senator info (including vote agreement counts) ## it's a 104 row X 108 column matrix ## (the first 4 columns are string data about each senator) cols = ['display','surname','party','color','state'] cols.extend(list(info)) sen_df = pd.DataFrame(index=info, columns = cols, data = np.zeros((len(info),len(info)+5))) ## change the first few columns to string dtype sen_df[['display','surname', 'party', 'color', 'state']] = sen_df[['display','surname', 'party', 'color', 'state']].astype(str) for senator in info: sen_df.loc[senator, 'display'] = senator surname = senator[:-7] sen_df.loc[senator,'surname'] = surname affiliation = re.search("\(([RDI]{1})", senator).groups()[0] sen_df.loc[senator,'party'] = affiliation home_state = senator[-3:-1] sen_df.loc[senator,'state'] = home_state col = 'r' if affiliation == 'R' else ('b' if affiliation == 'D' else 'k') sen_df.loc[senator,'color'] = col return sen_df def fill_vote_agreement(sen_df, data): ct = 0 sen_names = [] for vote in data: ct += 1 #print 'this is vote #',ct,'out of',len(data),'total votes.' 
already_counted = [] for vote_type in vote['votes']: if (vote_type == 'Yea') or (vote_type == 'Nay'): for nodeA in vote['votes'][vote_type]: nameA = nodeA['display_name'] if (not nameA in sen_names): sen_names.append(nameA) for nodeB in vote['votes'][vote_type]: nameB = nodeB['display_name'] this_pair = np.sort([nameA, nameB]) if (not (this_pair[0], this_pair[1]) in already_counted) and (nameA != nameB): already_counted.append((this_pair[0], this_pair[1])) #if (sen_df.loc[ this_pair[0], this_pair[1] ] > 1): #print 'value:', sen_df.loc[ this_pair[0], this_pair[1] ] sen_df.loc[this_pair[0],this_pair[1]] = sen_df.loc[this_pair[0],this_pair[1]] + 1 #print this_pair[0], 'and', this_pair[1], #' count: ', sen_df.loc[ this_pair[0], this_pair[1] ] return sen_df """ Function -------- vote_graph Parameters ---------- data : list of dicts The vote database returned from get_vote_data Returns ------- graph : NetworkX Graph object, with the following properties 1. Each node in the graph is labeled using the `display_name` of a Senator (e.g., 'Lee (R-UT)') 2. Each node has a `color` attribute set to 'r' for Republicans, 'b' for Democrats, and 'k' for Independent/other parties. 3. The edges between two nodes are weighted by the number of times two senators have cast the same Yea or Nay vote 4. Each edge also has a `difference` attribute, which is set to `1 / weight`. Examples -------- >>> graph = vote_graph(vote_data) >>> graph.node['Lee (R-UT)'] {'color': 'r'} # attributes for this senator >>> len(graph['Lee (R-UT)']) # connections to other senators 101 >>> graph['Lee (R-UT)']['Baldwin (D-WI)'] # edge relationship between Lee and Baldwin {'difference': 0.02, 'weight': 50} """ #Your code here def vote_graph(data): ## in general i know it's not good to create globals like this ## but seeing as we have required inputs and outputs for this function, this is my workaround to having access ## to this df later on, when i create my graphs. 
global sen_df ## ## three helper functions ## ## get_senator_display_names retrieves a list of all the senators in "[surname] ([party]-[state])" format info = get_senator_display_names(data) ## i want one big pandas df to hold all my data ## fill_sen_df() just gets all the basic information for a given senator sen_df = fill_sen_df(info) ## fill_vote_agreement goes in and loops through each vote to collect edge weights ## then it dumps it in the 104x104 matrix which constitutes the tail end of my df df = fill_vote_agreement(sen_df, data) ## for use later on in graph creation sen_df = df ## initialize Graph object g = nx.Graph() ## add graph nodes (as senator surnames) for idx, name in enumerate(df['display']): g.add_node(name, surname=df.ix[idx,'surname'], color=df.ix[idx, 'color']) ## add edges (along with weight and difference parameters) ## we do this with a nested for loop - basically running through all senator names twice, concurrently ## for each unique pair with some voting relationship, add an edge to the network for nameA in df['display']: for nameB in df['display']: this_pair = np.sort([nameA,nameB]) if (nameA != nameB) and (df.loc[this_pair[0]][this_pair[1]] > 0): ## get edge weight from sen_df weight = sen_df.loc[this_pair[0]][this_pair[1]] ## add edge between co-voting senators ## include 'weight' and 'difference' attributes g.add_edge(this_pair[0], this_pair[1], weight=weight, difference=(1./weight)) return g votes = vote_graph(vote_data) ## test print votes.node['Lee (R-UT)'] print len(votes['Lee (R-UT)']) print votes['Lee (R-UT)']['Baldwin (D-WI)'] #this makes sure draw_spring results are the same at each call np.random.seed(1) color = [votes.node[senator]['color'] for senator in votes.nodes()] #determine position of each node using a spring layout pos = nx.spring_layout(votes, iterations=200, k=2) #plot the edges nx.draw_networkx_edges(votes, pos, alpha = .05) #plot the nodes nx.draw_networkx_nodes(votes, pos, node_color=color) #draw the labels 
lbls = nx.draw_networkx_labels(votes, pos, alpha=5, font_size=8)

#coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)

#Your code here
## minimum spanning tree over 'difference' (= 1/weight): keeps the
## strongest-agreement edge structure of the vote graph
spanning_tree = nx.minimum_spanning_tree(votes, weight='difference')

## recompute color assignments
color = [spanning_tree.node[senator]['color'] for senator in spanning_tree.nodes()]

##
## repeat graph display code from above
## the only difference is k parameter for spring_layout
##
#determine position of each node using a spring layout
pos = nx.spring_layout(spanning_tree, iterations=200, k=.03)

#plot the edges
nx.draw_networkx_edges(spanning_tree, pos, alpha = .02)

#plot the nodes
nx.draw_networkx_nodes(spanning_tree, pos, node_color=color)

#draw the labels
lbls = nx.draw_networkx_labels(spanning_tree, pos, alpha=5, font_size=8)

#coordinate information is meaningless here, so let's remove it
plt.xticks([])
plt.yticks([])
remove_border(left=False, bottom=False)

#Your code here
## compute closeness scores
## add to sen_df
closeness = nx.closeness_centrality(votes, distance='difference')

## fix: was np.zeros((104,1)) — a hard-coded senator count AND a 2-D array
## assigned as a column; size the 1-D zero column from the frame itself
sen_df['closeness'] = np.zeros(len(sen_df))
for sen in closeness.keys():
    sen_df.ix[sen,'closeness'] = closeness[sen]

## sort senators from least to most central for the bar chart
close_df = sen_df.sort(columns='closeness')

N = close_df.shape[0]
ind = np.arange(N)  ## the x locations for the groups
width = .9          ## the width of the bars

fig, ax = plt.subplots()
ax.bar(ind, close_df['closeness'], width, color=list(close_df['color'].values))

## with help from @1503
xlocs, senators = zip(*enumerate(close_df['surname']))
xticks_locs, xticks_labels = plt.xticks(xlocs, senators)
plt.xlim(min(xlocs), max(xlocs))

## add some labels
ax.set_ylabel('Closeness score')
ax.set_xlabel('Senator Surnames')
ax.set_title('Closeness Centrality of US Senators in the 113th Congress')
ax.set_xticklabels( close_df['surname'], rotation='vertical', fontsize=8)

## it's hard to get everyone on in one graph - so we make it wider
fig.set_size_inches(19,5.5)
plt.show()
#your code here def across_aisle_graph(df): g_aisle = nx.Graph() for idx, name in enumerate(df['display']): g_aisle.add_node(name, surname=df.ix[idx,'surname'], color=df.ix[idx, 'color']) for nameA in df['display']: for nameB in df['display']: this_pair = np.sort([nameA,nameB]) if (nameA != nameB) and (df.loc[this_pair[0]][this_pair[1]] > 0) and (df.loc[nameA, 'party'] != df.loc[nameB, 'party']): ## get edge weight from sen_df weight = sen_df.loc[this_pair[0]][this_pair[1]] ## add edge between co-voting senators ## include 'weight' and 'difference' attributes g_aisle.add_edge(this_pair[0], this_pair[1], weight=weight, difference=(1./weight)) return g_aisle cross_aisle = across_aisle_graph(sen_df) np.random.seed(1) color = [cross_aisle.node[senator]['color'] for senator in cross_aisle.nodes()] #determine position of each node using a spring layout pos = nx.spring_layout(cross_aisle, iterations=200, k=2) #plot the edges nx.draw_networkx_edges(cross_aisle, pos, alpha = .05) #plot the nodes nx.draw_networkx_nodes(cross_aisle, pos, node_color=color) #draw the labels lbls = nx.draw_networkx_labels(cross_aisle, pos, alpha=5, font_size=8) #coordinate information is meaningless here, so let's remove it plt.xticks([]) plt.yticks([]) remove_border(left=False, bottom=False) #Your code here aisle_spanning_tree = nx.minimum_spanning_tree(cross_aisle, weight='difference') ## recompute color assignments color = [aisle_spanning_tree.node[senator]['color'] for senator in aisle_spanning_tree.nodes()] ## ## repeat graph display code from above ## the only difference is k parameter for spring_layout ## #determine position of each node using a spring layout pos = nx.spring_layout(aisle_spanning_tree, iterations=100, k=.4) #plot the edges nx.draw_networkx_edges(aisle_spanning_tree, pos, alpha = .02) #plot the nodes nx.draw_networkx_nodes(aisle_spanning_tree, pos, node_color=color) #draw the labels lbls = nx.draw_networkx_labels(aisle_spanning_tree, pos, alpha=5, font_size=8) #coordinate 
information is meaningless here, so let's remove it plt.xticks([]) plt.yticks([]) remove_border(left=False, bottom=False) path = 'images/before_minimizing.png' Image(path) path = 'images/after_minimizing.png' Image(path) """ Function -------- get_senate_bill Scrape the bill data from a single JSON page, given the bill number Parameters ----------- bill : int Bill number to fetch Returns ------- A dict, parsed from the JSON Examples -------- >>> bill = get_senate_bill(10) >>> bill['sponsor'] {u'district': None, u'name': u'Reid, Harry', u'state': u'NV', u'thomas_id': u'00952', u'title': u'Sen', u'type': u'person'} >>> bill['short_title'] u'Agriculture Reform, Food, and Jobs Act of 2013' """ #your code here def get_senate_bill(bill): url = ''.join(["https://www.govtrack.us/data/congress/113/bills/s/s",str(bill),"/data.json"]) #print 'Bill:',bill data = requests.get(url).json() return data ## test ## bill = get_senate_bill(10) ## print bill['sponsor'] ## print bill['short_title'] """ Function -------- get_all_bills Scrape all Senate bills at http://www.govtrack.us/data/congress/113/bills/s Parameters ---------- None Returns ------- A list of dicts, one for each bill """ #your code here from BeautifulSoup import BeautifulSoup import re def get_all_bills(): link = "https://www.govtrack.us/data/congress/113/bills/s/" data = requests.get(link).text soup = BeautifulSoup(data) ## Q: what does the regex mean? ## A: 's' and '/' are string literals ## \d is shorthand for [0-9] ## {1,4} means we need at least one digit, and up to 4 (we don't have more than 9,999 bills) pattern = re.compile("s\d{1,4}/") ## search for href tags with the above pattern senate_bills = soup.findAll(href=pattern) ## Q: why [1:-1] in vote['href']? 
## A: first character (index 0) is 's', last character (index -1) is '/' ## so [1:-1] to extract just the number, which is what we need to pass to get_senate_bill() all_senate_bills = [] all_folders = [] bills = [ int(bill['href'][1:-1]) for bill in senate_bills ] bills = np.sort(bills) #print votes missed = [] for bill in bills: #print 'Getting votes for', vote try: all_senate_bills.append(get_senate_bill(str(bill))) except: #print 'failed on: ',vote missed.append(bill) return all_senate_bills bill_list = get_all_bills() """ Function -------- bill_graph Turn the bill graph data into a NetworkX Digraph Parameters ---------- data : list of dicts The data returned from get_all_bills Returns ------- graph : A NetworkX DiGraph, with the following properties * Each node is a senator. For a label, use the 'name' field from the 'sponsor' and 'cosponsors' dict items * Each edge from A to B is assigned a weight equal to how many bills are sponsored by B and co-sponsored by A """ #Your code here def bill_graph(data): g2 = nx.DiGraph() senators = set() for x in data: senators.add(x['sponsor']['name']) for cosponsor in x['cosponsors']: senators.add(cosponsor['name']) for name in senators: for x in data: sponsor = x['sponsor']['name'] for cosponsor in x['cosponsors']: if name == cosponsor['name']: if g2.has_edge(name,sponsor): g2.edge[name][sponsor]['weight'] += 1 else: if name != sponsor: g2.add_edge(name,sponsor, weight=1) #cosponsor_df.loc[name, sponsor] += 1 return g2 bills = bill_graph(bill_list) #Your code here pgrank = nx.pagerank_numpy(g2, weight='weight') ## sort pageranks from high to low scores ## with guidance from: http://stackoverflow.com/questions/613183/python-sort-a-dictionary-by-value sorted_pgrank = sorted(pgrank.iteritems(), key=itemgetter(1), reverse=True) ## print 5 highest pageranks print '\n\nThe 5 senators with highest PageRank scores:\n' for senator in sorted_pgrank[:5]: print 'Name:',senator[0], ' PageRank:', round(senator[1],4) print "\n\n" names, 
ranks = zip(*sorted_pgrank) surnames = [name.split(',')[0] for name in list(names)] N = len(names) ind = np.arange(N) ## the x locations for the groups width = .9 ## the width of the bars fig, ax = plt.subplots() ax.bar(ind, ranks, width) ## with help from @1503 xlocs, senators = zip(*enumerate(names)) xticks_locs, xticks_labels = plt.xticks(xlocs, senators) plt.xlim(min(xlocs), max(xlocs)) ## add some labels ax.set_ylabel('PageRank score') ax.set_xlabel('Senators') ax.set_title('Bill Authorship "PageRank" of US Senators in the 113th Congress') ax.set_xticklabels( surnames, rotation='vertical', fontsize=8) ## it's hard to get everyone on in one graph - so we make it wider fig.set_size_inches(19,5.5) plt.show() ## get list of node degrees in order of sorted names from pagerank ordering degrees = [g2.degree(name) for name in list(names)] ## make the same plot as above, but add a second set of bars for degree N = len(names) ind = np.arange(N) ## the x locations for the groups width = .9 ## the width of the bars fig = plt.figure() ax = fig.add_subplot(111) ax.bar(ind, ranks, width) ## here we add the extra set of bars for node degree ax2 = ax.twinx() ax2.bar(ind+width+0.35, degrees, 0.45, color='#deb0b0') xlocs, senators = zip(*enumerate(names)) xticks_locs, xticks_labels = plt.xticks(xlocs, senators) plt.xlim(min(xlocs), max(xlocs)) ax.set_xlabel('Senators') ax.set_title('Bill Authorship "PageRank" (blue) vs Node Degrees (pink) in 113th Senate') ax.set_xticklabels( surnames, rotation='vertical', fontsize=8 ) fig.set_size_inches(19,5.5) plt.show() path = 'images/reid_menendez.png' Image(path) path = 'images/ayotte.png' Image(path) ## Originally had trouble converting graph object, as attributes were all numpy objects ## But write_gexf() wanted base objects (float, string, etc) ## ## The discussion on @1570 led me to this page: ## https://groups.google.com/forum/#!msg/networkx-discuss/0K61tql2Hv4/cJtR2AV3jJYJ ## and the function I use below is taken directly from that 
thread. Very helpful! def fix_float_for_gexf(network): for u,v,d in network.edges_iter(data=True): for k,v in d.items(): if isinstance(d[k], np.float64): d[k] = float( d[k] ) #network.edge[u][v] = d for u,d in network.nodes_iter(data=True): for k,v in d.items(): if isinstance(d[k], np.float64): d[k] = float( d[k] ) #network.node[u] = d return network votes3 = fix_float_for_gexf(votes2) nx.write_gexf(votes3, 'votes.gexf') from IPython.display import Image path = 'images/senate.png' Image(path) path = 'images/linkedin_zoom_full.png' Image(path)