# special IPython command to prepare the notebook for matplotlib %matplotlib inline #Array processing import numpy as np #Data analysis, wrangling and common exploratory operations import pandas as pd from pandas import Series, DataFrame #A sane way to get web data import requests #Packages for web scraping. No need to use both. Feel free to use one of them. from pattern import web from BeautifulSoup import BeautifulSoup #For visualization. Matplotlib for basic viz and seaborn for more stylish figures + statistical figures not in MPL. import matplotlib.pyplot as plt import seaborn as sns #For some of the date operations import datetime #Input: # url: URL of a wikipedia page # table_name: Name of the table to scrape #Output: # df is a Pandas data frame that contains a tabular representation of the table # The columns are named the same as the table columns # Each row of df corresponds to a row in the table def scraping_wikipedia_table(url, table_name): return None #Input: # url: URL of a Walmart results page for a search query in Movies department #Output: # df is a Pandas data frame that contains a tabular representation of the results # The df must have 9 columns that must have the same name and data type as described above # Each row of df corresponds to a movie in the results table def scraping_walmart_movies(url): return None #Input: dom - DOM of the books page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/books #Output: An array (Python list) of books listed in the profile page. # Note that this function requires a list as an output not a Pandas data frame def scraping_facebook_books(dom): return [] #Input: dom - DOM of the groups page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/groups #Output: A Pandas data frame with one row per group. # It must have three columns - 'Group Name', 'Number of Members', 'Group Description' # Note that all information of a group is in the same page (eg. https://www.facebook.com/zuck/groups) # You can collect all data from same page even if they are incomplete (such as group description) # Ensure that the column names as given above def scraping_facebook_groups(dom): return None #Input: dom - DOM of the music page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/music #Output: A Pandas data frame with one row per group. # It must have four columns # 'Name', 'Type' (eg. Musician/Band or Bands&Musicians) and 'Verified' (boolean: True/False), 'Profile Url' # Note that all information of a group is in the same page (eg. https://www.facebook.com/zuck/music) # Ensure that the column names as given above def scraping_facebook_music(dom): return None #Let us now make things little bit more harder. #In all previous cases, you only had to collect information from a single page. # But in reality, you have to collect information and integrate from multiple pages. # Let us try a simple version of such data integration #Input: dom - DOM of the music page corresponding to an FB account's profile. Eg, DOM of https://www.facebook.com/zuck/movies #Output: A Pandas data frame with one row per group. # It must have following columns - # 'Name', 'Type' (eg. Movie), 'Verified', 'Profile Url' - as before # 'Likes', 'Starring', 'Genre', 'Director', 'Movie URL' # The first three columns can be obtained from https://www.facebook.com/zuck/movies # Once you get the profile url, obtain the HTML of this url and use this content to obtain the last 5 column data # For example, Zuckerberg likes 'The Matrix' (great movie btw). # Then you get its profile url 'https://www.facebook.com/TheMatrixMovie?ref=profile' # Get the text of this url using requests package and parse information from the About tab # Ensure that the column names as given above def scraping_facebook_movies(dom): return None #read the csv file into a Pandas data frame fec_all = pd.read_csv('fec_2012_contribution_subset.csv', low_memory=False) #We will be doing party wise analysis later. So, we want to associate each candidate with their party parties = {'Bachmann, Michelle': 'Republican', 'Cain, Herman': 'Republican', 'Gingrich, Newt': 'Republican', 'Huntsman, Jon': 'Republican', 'Johnson, Gary Earl': 'Republican', 'McCotter, Thaddeus G': 'Republican', 'Obama, Barack': 'Democrat', 'Paul, Ron': 'Republican', 'Pawlenty, Timothy': 'Republican', 'Perry, Rick': 'Republican', "Roemer, Charles E. 'Buddy' III": 'Republican', 'Romney, Mitt': 'Republican', 'Santorum, Rick': 'Republican'} #create a new column called party that sets the value to the party of the candidate # The way this line works is as follows: # 1. fec_all.cand_nm gives a vector (or Series in Pandas terminology) # 2. For each row, the code looks up the candidate name to the dictionary parties # 3. If the name of the candidate (cand_nm) is in parties, it returns the value (i.e. Republican or Democrat) # 4. This whole thing is done for each row and you get another vector as output # 5. Finally, you create a new column in fec_all called 'party' and assign the vector you got to it. # 6. All in a single line :) fec_all['party'] = fec_all.cand_nm.map(parties) #ignore the refunds # Get the subset of dataset where contribution amount is positive fec_all = fec_all[fec_all.contb_receipt_amt > 0] #fec_all contains details about all presidential candidates. #fec contains the details about contributions to Barack Obama and Mitt Romney only #for the rest of the tasks, unless explicitly specified, work on the fec data frame. fec = fec_all[fec_all.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])] #Task 1a: print the details of the data frame. # Basically, this should contain information such as number of rows,columns, name of columns, #non-null values for each column etc # Hint: there is a Pandas function to do this #replace None with your code print "Task 1a: Details of FEC data frame are: \n", None #Task 1b: finding the number of rows and columns in the data frame. #Hint: find the property of the data frame that gives the number of rows and columns #replace None with your code t1b_num_rows = None t1b_num_cols = None print "Task 1b: #Rows=%s, #Columns=%s" % (t1b_num_rows, t1b_num_cols) #Task 1c: The only numeric data is 'contb_receipt_amt' which is the amount of contribution. # Print the descriptive details (min, max, quantiles etc) for 'contb_receipt_amt' #Hint: as above there is a direct pandas command for it. #replace None with your code print "Task 1c: Descriptive details of contb_receipt_amt is \n", None #Task 1d: Let us now print the number of unique values for few columns #Hint: Look for a Pandas function to do this. t1d_num_uniq_cand_id = None t1d_num_uniq_cand_nm = None t1d_num_uniq_contbr_city = None t1d_num_uniq_contbr_st = None print "Task 1d: #Uniq cand_id = ", t1d_num_uniq_cand_id print "Task 1d: #Uniq cand_num = ", t1d_num_uniq_cand_nm print "Task 1d: #Uniq contbr_city = ", t1d_num_uniq_contbr_city print "Task 1d: #Uniq contbr_st = ", t1d_num_uniq_contbr_st #Task 2a: Let us find out how much contributions did Obama and Romney made in this dataset # Remember, that this is not the complete amount as it excludes other sources like PACs, Super PACs and # spending by party committes. #Hint: use cand_nm field t2a_tot_amt_obama = None t2a_tot_amt_romney = None print "Task 2a: Total Contribution for Obama is %s and for Romney is %s" % (t2a_tot_amt_obama, t2a_tot_amt_romney) #Task 2b: How much contribution did folks from California, New York and Texas make totally (i.e. to both Obama and Romney). #use contbr_st field t2b_tot_amt_CA = None t2b_tot_amt_NY = None t2b_tot_amt_TX = None print "Task 2b: Total contributions from California is %s, New York is %s and Texas is %s" % (t2b_tot_amt_CA, t2b_tot_amt_NY, t2b_tot_amt_TX) #Task 2c: Let us now use multiple filtering criteria # How much money did folks from Texas made to BO and MR? # How much money did folks from UT Arlington made to BO and MR? t2c_tot_contr_tx_bo = None t2c_tot_contr_tx_mr = None t2c_tot_contr_uta_bo = None t2c_tot_contr_uta_mr = None print "Task 2c: From TX, BO got %s and MR got %s dollars" % (t2c_tot_contr_tx_bo, t2c_tot_contr_tx_mr) print "Task 2c: From UTA, BO got %s and MR got %s dollars" % (t2c_tot_contr_uta_bo, t2c_tot_contr_uta_mr) #Task 2d: How much did Engineers from Google gave to BO and MR. # This task is a bit tricky as there are many variations: eg, SOFTWARE ENGINEER vs ENGINEER and GOOGLE INC. vs GOOGLE t2d_tot_engr_goog_bo = None t2d_tot_engr_goog_mr = None print "Task 2d: From Google Engineers, BO got %s and MR got %s dollars" % (t2d_tot_engr_goog_bo, t2d_tot_engr_goog_mr) #Task 3a: For each state, print the total contribution they made to both candidates. t3a_state_contr_both = None print "Task 3a: Total contribution made to both candidates by each state are", t3a_state_contr_both #Task 3b: Now let us limit ourselves to TX. For each city in TX, print the total contribution made to both candidates t3b_tx_city_contr_both = None print "Task 3b: Total contribution made to both candidates by each city in TX are", t3b_tx_city_contr_both #Task 3c: Now let us zoom into Arlington, TX. For each zipcode in Arlington, print the total contribution made to both candidates t3c_arlington_contr_both = None print "Task 3c: Total contribution made to both candidates by each zipcode in Arlington are", t3c_arlington_contr_both #Task 4a: Print the number of contributors to Obama in each state. t4a_num_contr_obama_per_state = None print "Number of contributions to Obama in each state is ", t4a_num_contr_obama_per_state #Task 4b: Print the top-10 states (based on number of contributors) that contributed to Obama. # print both state name and number of contributors t4b_top10_obama_contr_states = None print "Top-10 states with most contributors to Obama are ", t4b_top10_obama_contr_states #Task 4c: Print the top-20 occupations that contributed overall (to both BO and MR) t4c_top20_contr_occupation = None print "Top-20 Occupations with most contributors are ", t4c_top20_contr_occupation #Task 4d: Print the top-10 Employers that contributed overall (to both BO and MR) t4d_top10_contr_employer_all = None print "Top-10 Employers with most contributors are ", t4d_top10_contr_employer_all #Task 5a: Draw a "horizontal" bar chart with one bar each for Obama and Romney with the value corresponding to total amount they raised. # Remember to make the bar chart into a horizontal bar chart #########################begin code for Task 5a #########################end code for Task 5a #Task 5b: Draw the "horizontal" bar chart of total NUMBER of contributions made per Candidate. # ie Candidate name vs number of contributions for that candidate #########################begin code for Task 5b #########################end code for Task 5b #Task 5c: Draw the "horizontal" bar chart of average contributions made per Candidate. # ie Candidate Name vs avg contribution #########################begin code for Task 5c #########################end code for Task 5c #Task 5d: Draw a "horizontal" bar chart that lists the top-10 states based on the TOTAL contribution they made to both candidates # For each state, draw two lines - one in blue for Obama and one in red for Romney # Display the proportion of the total contribution that came from the state. # For eg, if Obama made 1 billion and TX gave 100 million of it, the proportion is 10% # Remember to make the bar chart into a horizontal bar chart #########################begin code for Task 5d #########################end code for Task 5d #Task 5e: Now repeat the same process based on Occupation (again top-10) #########################begin code for Task 5e #########################end code for Task 5e #Task 5f: Now repeat the same process based on Employers (again top-10) #########################begin code for Task 5f #########################end code for Task 5f #Task 5g: Draw the histogram of total NUMBER of contributions made per each state. # X-axis : state, Y-axis : number of contribution from that state #########################begin code for Task 5g #########################end code for Task 5g #Task 5h: Draw a histogram of actual contributions made for Obama. Set bin size as 50 #X-axis: contribution amount bin, Y-axis: frequency #########################begin code for Task 5h #########################end code for Task 5h #Task 5i: Draw a histogram of actual contributions made for Romney. Set bin size as 50 #X-axis: contribution amount bin, Y-axis: frequency #########################begin code for Task 5i #########################end code for Task 5i #Harder #Task 5j: Draw a line chart showing how the campaign contribution of Obama and Romney varied every quarter #Use blue for Obama and red for Romney #This is a bit tricky because, you have to convert contribution date to quarter. #You can either do it on your own or see if Pandas has some function #########################begin code for Task 5j #########################end code for Task 5j #Harder #Task 5k: Draw a line chart showing the CUMULATIVE campaign contribution of Obama and Romney varied every quarter # In other words, if Obama made X, Y, Z in first, second and third quarters # then plot X for first quarter, X+Y for second quarter and X+Y+Z for third quarter. #Use blue for Obama and red for Romney #This is a bit tricky because, you have to convert contribution date to quarter. #You can either do it on your own or see if Pandas has some function #########################begin code for Task 5k #########################end code for Task 5k #Tasks 5l and 5m #Repeat 5j and 5k but do it for NUMBER of contributors #In other words, 5l plots the number of contributors for Obama and Romney, quarter over quarter #5m plots cumulative number of contributors quarter over quarter. #########################begin code for Task 5l #########################end code for Task 5l #########################begin code for Task 5m #########################end code for Task 5m #The following set of tasks are a bit tricky: # you need to use multiple commands to achieve. Specifically look at cut, groupby and unstack #Task 6a: Discretize the contributions of Obama and Romney based on the bins given below. # For example, if Obama got contributions such as 2, 6, 16, 18, 120, then he has # 0 contribution in (0,1], 2 contributions in (1,10], 2 contributions in (10, 100] and 1 contribution in (100, 1000] bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]) labels = None # set the variable labels to the output of pd.cut grouped = None #Group the data based on labels and candidate names #Replace None below in the print statement with grouped.size().unstack(0) . # If your code for labels and grouped is correct, this should print number of people in each bin per candidate print None #Task 6b: In Task 6a, we calculated the COUNT (i.e. the number of contributors in each bin) # This by itself is not sufficient. # In this task, let us compute the TOTAL AMOUNT in each bucket. # Specifically, compute for each candidate the total amount of contributions that each got in each bucket. # Continuing the example above, Obama's total contribution in each bucket is # 0 in (0,1], 8 in (1,10], 34 in (10, 100] and 120 in (100, 1000] #This could be done in 1 line from the variable grouped above t6b_bucket_sums = None print t6b_bucket_sums #Task 6c: Even this does not fully specify the disparity in the funding scenario. # To see this let us now compute the PROPORTION of total contribution in each bucket # This is called normalization and is a common operation. # Typically normalization is done over the same candidate. # But for the point I am trying to make, let us normalize within bucket. # For example, if obama made X in a bucket and Y in another bucket, # then normalization will give X/(X+Y) for Obama and Y/(X+Y) for Romney # This is also quite easy in Pandas and can be done in 1 line # The key idea is to realize that sum function takes an argument called axis # that does different things for axis=0 and axis=1 (figure out which does what) t6c_normed_bucket_sums = None print t6c_normed_bucket_sums #Once you have done this , uncomment the following line to a horizontal stacked bar chart #t6c_normed_bucket_sums[:-2].plot(kind='barh', stacked=True) #Task 6d: Let us go back and try to do the other analysis # Let us now try to see what PROPORTION of each candidate's amount came from each bucket. # This is the classical case of normalization. # Continuing the example above, Obama has made 0+8+34+120=162 # We can normalize it by diving each bucket by the total # For example, 0/162, 8/162, 34/162 and 120/162. # If you finished t6c, then this just a matter of playing with axis values. t6d_normed_bucket_sums = None print t6d_normed_bucket_sums #Once you have done this , uncomment the following line to a horizontal stacked bar chart #t6d_normed_bucket_sums.plot(kind='barh', stacked=True) #Task 7a: Write two functions: one for Obama and one for Romney that does the following: # Given a value of N (N can be 1, 2, 5, 10 etc), sort the contributions made to the candidate in decreasing order # Then find how much contribution the top-N% made # Then compute the fraction to the overall campaign collection # For example, if Obama collected 1 billion dollars and the top-1% gave 100 million then their contribution is 10% def t7a_contributions_by_top_N_pct_obama(N): return None def t7a_contributions_by_top_N_pct_romney(N): return None for N in [1, 2, 5, 10, 20]: print "N=%s, Obama proportion=%s and Romney proportion = %s" % (N, t7a_contributions_by_top_N_pct_obama(N), t7a_contributions_by_top_N_pct_romney(N)) #Task 7b: Now let us see who these people in 1% are # Compute the top-1% based on total contribution (ie. money they gave to Obama + Romney) # Now let us see some information about them. # For each of the folks in 1%, print the following: # name, state, occupation, employer, total amount they gave to Obama, total amount they gave to Romney t7b_1pcters = None print t7b_1pcters #Task 8a: One way to measure polarity is to see how different some distribution are. # For both Obama and Romney, print the top-10 states based on their total AMOUNT of contribution # Do you see lot of common states? t8a_top_10_states_obama = None t8a_top_10_states_romney = None print t8a_top_10_states_obama print t8a_top_10_states_romney #Task 8b: For both Obama and Romney, print the top-10 occupation based on their total AMOUNT of contribution # Do you see lot of common occupation? t8b_top_10_occu_obama = None t8b_top_10_occu_romney = None print t8b_top_10_occu_obama print t8b_top_10_occu_romney #Task 8c: For both Obama and Romney, print the top-10 employers based on their total AMOUNT of contribution # Do you see lot of common employers? t8c_top_10_emp_obama = None t8c_top_10_emp_romney = None print t8c_top_10_emp_obama print t8c_top_10_emp_romney #Harder #Task 8d: Here is another way to compute polarization # Find the top-1000 contributors based on their TOTAL contribution (to both Obama and Romney) # For each of the top-1000 folks count the number of people who donated to both, to Obama only and to Romney only t8d_top_1000_both = None t8d_top_1000_BO_only = None t8d_top_1000_MR_only = None print t8d_top_1000_both, t8d_top_1000_BO_only, t8d_top_1000_MR_only #Harder: #Task 8e: Here is yet another way # For each state, compute what fraction of amount went to Obama and what fraction went to Romney # If there is no polarization, then both will get more or less equal amount. # If there is polarization, then the amount will skewed. # Let us use a very crude measure to compute polarization # If X is what Obama got from a state and Y is what Romney got, compute the value of max(X,Y)/ (min(X,Y) + 1) # For each state compute this value and sort the results in a descending order. # Do you see any pattern? t8e_state_contr_ranked_by_polarity = None print t8e_state_contr_ranked_by_polarity #Harder: #Task 8f: Repeat the above analysis for occupation # However, instead of taking all occupations, let us only take the top-50 occupations based on TOTAL contributions made # For each occupation compute this value and sort the results in a descending order and displ # Do you see any pattern? t8f_occu_contr_ranked_by_polarity = None print t8f_occu_contr_ranked_by_polarity #Harder: #Task 8g: A known variable of polarization is based on where a person lives. # At the risk of too much generalization, liberals dominate cities while conservations dominate rural areas # Let us see if this holds in Texas. # Texas is a known solid red (i.e. conservative) state. # For each state in Texas, compute the polarity and order them by polarity. # Do you see any pattern? t8g_tx_city_contr_ranked_by_polarity = None print t8g_tx_city_contr_ranked_by_polarity