# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

#Array processing
import numpy as np
#Data analysis, wrangling and common exploratory operations
import pandas as pd
from pandas import Series, DataFrame

#A sane way to get web data
import requests

#Packages for web scraping. No need to use both. Feel free to use one of them.
from pattern import web
from BeautifulSoup import BeautifulSoup

#For visualization. Matplotlib for basic viz and seaborn for more stylish figures + statistical figures not in MPL.
import matplotlib.pyplot as plt
import seaborn as sns

#For some of the date operations
import datetime

#Input:
#    url: URL of a wikipedia page
#    table_name: Name of the table to scrape
#Output:
#    df is a Pandas data frame that contains a tabular representation of the table
#    The columns are named the same as the table columns
#    Each row of df corresponds to a row in the table
def scraping_wikipedia_table(url, table_name):
    return None

#Input:
#    url: URL of a Walmart results page for a search query in Movies department
#Output:
#    df is a Pandas data frame that contains a tabular representation of the results
#    The df must have 9 columns that must have the same name and data type as described above
#    Each row of df corresponds to a movie in the results table
def scraping_walmart_movies(url):
    return None

#Input: dom - DOM of the books page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/books
#Output: An array (Python list) of books listed in the profile page. 
#    Note that this function requires a list as an output not a Pandas data frame
def scraping_facebook_books(dom):
    return []

#Input: dom - DOM of the groups page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/groups
#Output: A Pandas data frame with one row per group. 
#    It must have three columns - 'Group Name', 'Number of Members', 'Group Description'
#    Note that all information of a group is in the same page (eg. https://www.facebook.com/zuck/groups)
#    You can collect all data from same page even if they are incomplete (such as group description)
#    Ensure that the column names as given above
def scraping_facebook_groups(dom):
    return None

#Input: dom - DOM of the music page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/music
#Output: A Pandas data frame with one row per group. 
#    It must have four columns 
#    'Name', 'Type' (eg. Musician/Band or Bands&Musicians) and 'Verified' (boolean: True/False), 'Profile Url'
#    Note that all information of a group is in the same page (eg. https://www.facebook.com/zuck/music)
#    Ensure that the column names as given above
def scraping_facebook_music(dom):
    return None

#Let us now make things little bit more harder. 
#In all previous cases, you only had to collect information from a single page.
# But in reality, you have to collect information and integrate from multiple pages.
# Let us try a simple version of such data integration 

#Input: dom - DOM of the music page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/movies
#Output: A Pandas data frame with one row per group. 
#    It must have following columns - 
#        'Name', 'Type' (eg. Movie), 'Verified', 'Profile Url' - as before
#        'Likes', 'Starring', 'Genre', 'Director', 'Movie URL'

#    The first three columns can be obtained from https://www.facebook.com/zuck/movies
#    Once you get the profile url, obtain the HTML of this url and use this content to obtain the last 5 column data
#    For example, Zuckerberg likes 'The Matrix' (great movie btw). 
#    Then you get its profile url 'https://www.facebook.com/TheMatrixMovie?ref=profile'
#    Get the text of this url using requests package and parse information from the About tab
#    Ensure that the column names as given above
def scraping_facebook_movies(dom):
    return None

#read the csv file into a Pandas data frame
fec_all = pd.read_csv('fec_2012_contribution_subset.csv', low_memory=False)

#We will be doing party wise analysis later. So, we want to associate each candidate with their party
parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}

#create a new column called party that sets the value to the party of the candidate
# The way this line works is as follows:
#  1. fec_all.cand_nm gives a vector (or Series in Pandas terminology)
#  2. For each row, the code looks up the candidate name to the dictionary parties
#  3. If the name of the candidate (cand_nm) is in parties, it returns the value (i.e. Republican or Democrat)
#  4. This whole thing is done for each row and you get another vector as output
#  5. Finally, you create a new column in fec_all called 'party' and assign the vector you got to it.
#  6. All in a single line :)
fec_all['party'] = fec_all.cand_nm.map(parties)

#ignore the refunds
# Get the subset of dataset where contribution amount is positive
fec_all = fec_all[fec_all.contb_receipt_amt > 0]

#fec_all contains details about all presidential candidates. 
#fec contains the details about contributions to Barack Obama and Mitt Romney only
#for the rest of the tasks, unless explicitly specified, work on the fec data frame.
fec = fec_all[fec_all.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]


#Task 1a: print the details of the data frame. 
# Basically, this should contain information such as number of rows,columns, name of columns, #non-null values for each column etc
# Hint: there is a Pandas function to do this
#replace None with your code
print "Task 1a: Details of FEC data frame are: \n", None 

#Task 1b: finding the number of rows and columns in the data frame.
#Hint: find the property of the data frame that gives the number of rows and columns
#replace None with your code
t1b_num_rows = None
t1b_num_cols = None
print "Task 1b: #Rows=%s, #Columns=%s" % (t1b_num_rows, t1b_num_cols) 

#Task 1c: The only numeric data is 'contb_receipt_amt' which is the amount of contribution. 
# Print the descriptive details (min, max, quantiles etc) for 'contb_receipt_amt'
#Hint: as above there is a direct pandas command for it.
#replace None with your code
print "Task 1c: Descriptive details of contb_receipt_amt is \n", None

#Task 1d: Let us now print the number of unique values for few columns
#Hint: Look for a Pandas function to do this.
t1d_num_uniq_cand_id = None
t1d_num_uniq_cand_nm = None
t1d_num_uniq_contbr_city = None
t1d_num_uniq_contbr_st = None
print "Task 1d: #Uniq cand_id = ", t1d_num_uniq_cand_id
print "Task 1d: #Uniq cand_num = ", t1d_num_uniq_cand_nm
print "Task 1d: #Uniq contbr_city = ", t1d_num_uniq_contbr_city
print "Task 1d: #Uniq contbr_st = ", t1d_num_uniq_contbr_st

#Task 2a: Let us find out how much contributions did Obama and Romney made in this dataset
# Remember, that this is not the complete amount as it excludes other sources like PACs, Super PACs and 
#  spending by party committes.
#Hint: use cand_nm field
t2a_tot_amt_obama = None
t2a_tot_amt_romney = None
print "Task 2a: Total Contribution for Obama is %s and for Romney is %s" % (t2a_tot_amt_obama, t2a_tot_amt_romney)

#Task 2b: How much contribution did folks from California, New York and Texas make totally (i.e. to both Obama and Romney).
#use contbr_st field
t2b_tot_amt_CA = None
t2b_tot_amt_NY = None
t2b_tot_amt_TX = None
print "Task 2b: Total contributions from California is %s, New York is %s and Texas is %s" % (t2b_tot_amt_CA, t2b_tot_amt_NY, t2b_tot_amt_TX)

#Task 2c: Let us now use multiple filtering criteria
# How much money did folks from Texas made to BO and MR?
# How much money did folks from UT Arlington made to BO and MR?
t2c_tot_contr_tx_bo = None
t2c_tot_contr_tx_mr = None
t2c_tot_contr_uta_bo = None
t2c_tot_contr_uta_mr = None

print "Task 2c: From TX, BO got %s and MR got %s dollars" % (t2c_tot_contr_tx_bo, t2c_tot_contr_tx_mr)
print "Task 2c: From UTA, BO got %s and MR got %s dollars" % (t2c_tot_contr_uta_bo, t2c_tot_contr_uta_mr)

#Task 2d: How much did Engineers from Google gave to BO and MR.
# This task is a bit tricky as there are many variations: eg, SOFTWARE ENGINEER vs ENGINEER and GOOGLE INC. vs GOOGLE
t2d_tot_engr_goog_bo = None
t2d_tot_engr_goog_mr = None
print "Task 2d: From Google Engineers, BO got %s and MR got %s dollars" % (t2d_tot_engr_goog_bo, t2d_tot_engr_goog_mr)

#Task 3a: For each state, print the total contribution they made to both candidates. 
t3a_state_contr_both = None
print "Task 3a: Total contribution made to both candidates by each state are", t3a_state_contr_both

#Task 3b: Now let us limit ourselves to TX. For each city in TX, print the total contribution made to both candidates
t3b_tx_city_contr_both = None
print "Task 3b: Total contribution made to both candidates by each city in TX are", t3b_tx_city_contr_both

#Task 3c: Now let us zoom into  Arlington, TX. For each zipcode in Arlington, print the total contribution made to both candidates
t3c_arlington_contr_both = None
print "Task 3c: Total contribution made to both candidates by each zipcode in Arlington are", t3c_arlington_contr_both

#Task 4a: Print the number of contributors to Obama in each state.
t4a_num_contr_obama_per_state = None
print "Number of contributions to Obama in each state is ", t4a_num_contr_obama_per_state

#Task 4b: Print the top-10 states (based on number of contributors) that contributed to Obama.
# print both state name and number of contributors
t4b_top10_obama_contr_states = None
print "Top-10 states with most contributors to Obama are ", t4b_top10_obama_contr_states

#Task 4c: Print the top-20 occupations that contributed overall (to both BO and MR)
t4c_top20_contr_occupation = None
print "Top-20 Occupations with most contributors are ", t4c_top20_contr_occupation

#Task 4d: Print the top-10 Employers that contributed overall (to both BO and MR)
t4d_top10_contr_employer_all = None
print "Top-10 Employers with most contributors are ", t4d_top10_contr_employer_all

#Task 5a: Draw a "horizontal" bar chart with one bar each for Obama and Romney with the value corresponding to total amount they raised.
# Remember to make the bar chart into a horizontal bar chart
#########################begin code for Task 5a
#########################end code for Task 5a


#Task 5b: Draw the "horizontal" bar chart of total NUMBER of contributions made per Candidate.
# ie Candidate name vs number of contributions for that candidate
#########################begin code for Task 5b
#########################end code for Task 5b

#Task 5c: Draw the "horizontal" bar chart of average contributions made per Candidate.
# ie Candidate Name vs avg contribution
#########################begin code for Task 5c
#########################end code for Task 5c


#Task 5d: Draw a "horizontal" bar chart that lists the top-10 states based on the TOTAL contribution they made to both candidates
# For each state, draw two lines - one in blue for Obama and one in red for Romney
# Display the proportion of the total contribution that came from the state.
# For eg, if Obama made 1 billion and TX gave 100 million of it, the proportion is 10% 
# Remember to make the bar chart into a horizontal bar chart
#########################begin code for Task 5d
#########################end code for Task 5d


#Task 5e: Now repeat the same process based on Occupation (again top-10)
#########################begin code for Task 5e
#########################end code for Task 5e


#Task 5f: Now repeat the same process based on Employers (again top-10)
#########################begin code for Task 5f
#########################end code for Task 5f


#Task 5g: Draw the histogram of total NUMBER of contributions made per each state.
# X-axis : state, Y-axis : number of contribution from that state
#########################begin code for Task 5g
#########################end code for Task 5g


#Task 5h: Draw a histogram of actual contributions made for Obama. Set bin size as 50
#X-axis: contribution amount bin, Y-axis: frequency
#########################begin code for Task 5h
#########################end code for Task 5h


#Task 5i: Draw a histogram of actual contributions made for Romney. Set bin size as 50
#X-axis: contribution amount bin, Y-axis: frequency
#########################begin code for Task 5i
#########################end code for Task 5i


#Harder
#Task 5j: Draw a line chart showing how the campaign contribution of Obama and Romney varied every quarter
#Use blue for Obama and red for Romney
#This is a bit tricky because, you have to convert contribution date to quarter.
#You can either do it on your own or see if Pandas has some function
#########################begin code for Task 5j
#########################end code for Task 5j


#Harder
#Task 5k: Draw a line chart showing the CUMULATIVE campaign contribution of Obama and Romney varied every quarter
# In other words, if Obama made X, Y, Z in first, second and third quarters
#  then plot X for first quarter, X+Y for second quarter and X+Y+Z for third quarter.
#Use blue for Obama and red for Romney
#This is a bit tricky because, you have to convert contribution date to quarter.
#You can either do it on your own or see if Pandas has some function
#########################begin code for Task 5k
#########################end code for Task 5k

#Tasks 5l and 5m
#Repeat 5j and 5k but do it for NUMBER of contributors
#In other words, 5l plots the number of contributors for Obama and Romney, quarter over quarter
#5m plots cumulative number of contributors quarter over quarter.

#########################begin code for Task 5l
#########################end code for Task 5l


#########################begin code for Task 5m
#########################end code for Task 5m


#The following set of tasks are a bit tricky: 
#    you need to use multiple commands to achieve. Specifically look at cut, groupby and unstack

#Task 6a: Discretize the contributions of Obama and Romney based on the bins given below.
# For example, if Obama got contributions such as 2, 6, 16, 18, 120, then he has 
#  0 contribution in (0,1], 2 contributions in (1,10], 2 contributions in (10, 100] and 1 contribution in (100, 1000]
bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = None # set the variable labels to the output of pd.cut
grouped = None #Group the data based on labels and candidate names
#Replace None below in the print statement with grouped.size().unstack(0) . 
#    If your code for labels and grouped is correct, this should print number of people in each bin per candidate
print None 

#Task 6b: In Task 6a, we calculated the COUNT (i.e. the number of contributors in each bin)
# This by itself is not sufficient.
# In this task, let us compute the TOTAL AMOUNT in each bucket.
# Specifically, compute for each candidate the total amount of contributions that each got in each bucket.
# Continuing the example above, Obama's total contribution in each bucket is
#  0 in (0,1], 8 in (1,10], 34 in (10, 100] and 120 in (100, 1000]
#This could be done in 1 line from the variable grouped above
t6b_bucket_sums = None
print t6b_bucket_sums


#Task 6c: Even this does not fully specify the disparity in the funding scenario.
# To see this let us now compute the PROPORTION of total contribution in each bucket
# This is called normalization and is a common operation. 
# Typically normalization is done over the same candidate.
# But for the point I am trying to make, let us normalize within bucket.
# For example, if obama made X in a bucket and Y in another bucket, 
#    then normalization will give X/(X+Y) for Obama and Y/(X+Y) for Romney
# This is also quite easy in Pandas and can be done in 1 line
# The key idea is to realize that sum function takes an argument called axis
#  that does different things for axis=0 and axis=1 (figure out which does what)
t6c_normed_bucket_sums = None
print t6c_normed_bucket_sums

#Once you have done this , uncomment the following line to a horizontal stacked bar chart
#t6c_normed_bucket_sums[:-2].plot(kind='barh', stacked=True)


#Task 6d: Let us go back and try to do the other analysis
# Let us now try to see what PROPORTION of each candidate's amount came from each bucket.
#  This is the classical case of normalization.
#  Continuing the example above, Obama has made 0+8+34+120=162
#  We can normalize it by diving each bucket by the total
#   For example, 0/162, 8/162, 34/162 and 120/162.
#  If you finished t6c, then this just a matter of playing with axis values.
t6d_normed_bucket_sums = None
print t6d_normed_bucket_sums


#Once you have done this , uncomment the following line to a horizontal stacked bar chart
#t6d_normed_bucket_sums.plot(kind='barh', stacked=True)


#Task 7a: Write two functions: one for Obama and one for Romney that does the following:
# Given a value of N (N can be 1, 2, 5, 10 etc), sort the contributions made to the candidate in decreasing order
#  Then find how much contribution the top-N% made
#  Then compute the fraction to the overall campaign collection
# For example, if Obama collected 1 billion dollars and the top-1% gave 100 million then their contribution is 10%

def t7a_contributions_by_top_N_pct_obama(N):
    return None

def t7a_contributions_by_top_N_pct_romney(N):
    return None


for N in [1, 2, 5, 10, 20]:
    print "N=%s, Obama proportion=%s and Romney proportion = %s" % (N, t7a_contributions_by_top_N_pct_obama(N), t7a_contributions_by_top_N_pct_romney(N))

    
#Task 7b: Now let us see who these people in 1% are
# Compute the top-1% based on total contribution (ie. money they gave to Obama + Romney)
# Now let us see some information about them.
# For each of the folks in 1%, print the following:
#    name, state, occupation, employer, total amount they gave to Obama, total amount they gave to Romney
t7b_1pcters = None
print t7b_1pcters

#Task 8a: One way to measure polarity is to see how different some distribution are. 
# For both Obama and Romney, print the top-10 states based on their total AMOUNT of contribution
# Do you see lot of common states? 
t8a_top_10_states_obama = None
t8a_top_10_states_romney = None

print t8a_top_10_states_obama
print t8a_top_10_states_romney

#Task 8b: For both Obama and Romney, print the top-10 occupation based on their total AMOUNT of contribution
# Do you see lot of common occupation? 
t8b_top_10_occu_obama = None
t8b_top_10_occu_romney = None

print t8b_top_10_occu_obama
print t8b_top_10_occu_romney


#Task 8c: For both Obama and Romney, print the top-10 employers based on their total AMOUNT of contribution
# Do you see lot of common employers? 
t8c_top_10_emp_obama = None
t8c_top_10_emp_romney = None

print t8c_top_10_emp_obama
print t8c_top_10_emp_romney


#Harder
#Task 8d: Here is another way to compute polarization
# Find the top-1000 contributors based on their TOTAL contribution (to both Obama and Romney)
# For each of the top-1000 folks count the number of people who donated to both, to Obama only and to Romney only
t8d_top_1000_both = None
t8d_top_1000_BO_only = None
t8d_top_1000_MR_only = None

print t8d_top_1000_both, t8d_top_1000_BO_only, t8d_top_1000_MR_only

#Harder:
#Task 8e: Here is yet another way
# For each state, compute what fraction of amount went to Obama and what fraction went to Romney
# If there is no polarization, then both will get more or less equal amount. 
# If there is polarization, then the amount will skewed.
# Let us use a very crude measure to compute polarization
#    If X is what Obama got from a state and Y is what Romney got, compute the value of max(X,Y)/ (min(X,Y) + 1)
# For each state compute this value and sort the results in a descending order. 
# Do you see any pattern?

t8e_state_contr_ranked_by_polarity = None
print t8e_state_contr_ranked_by_polarity


#Harder:
#Task 8f: Repeat the above analysis for occupation
# However, instead of taking all occupations, let us only take the top-50 occupations based on TOTAL contributions made
# For each occupation compute this value and sort the results in a descending order and displ
# Do you see any pattern?

t8f_occu_contr_ranked_by_polarity = None
print t8f_occu_contr_ranked_by_polarity


#Harder:
#Task 8g: A known variable of polarization is based on where a person lives.
# At the risk of too much generalization, liberals dominate cities while conservations dominate rural areas
# Let us see if this holds in Texas.
# Texas is a known solid red (i.e. conservative) state.
# For each state in Texas, compute the polarity and order them by polarity.
# Do you see any pattern?
t8g_tx_city_contr_ranked_by_polarity = None
print t8g_tx_city_contr_ranked_by_polarity