# For data
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
from __future__ import division
import datetime
#use to get info from web
import requests
#stringIO to work with csv
from StringIO import StringIO
doner_df = pd.read_csv('Election_Doner_Data.csv', index_col=False, dtype={
'cmte_id': object, 'cand_id': object, 'cand_nm': object, 'contbr_nm': object, 'contbr_city': object, 'contbr_st': object,
'contbr_zip': object, 'contbr_employer': object, 'contbr_occupation': object, 'contb_receipt_amt': float,
'contb_receipt_dt': object, 'receipt_desc': object,'memo_cd': object, 'memo_text': object, 'form_tp': object,
'file_num': int, 'tran_id': object, 'election_tp': object})
doner_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7440252 entries, 0 to 7440251 Data columns (total 18 columns): cmte_id object cand_id object cand_nm object contbr_nm object contbr_city object contbr_st object contbr_zip object contbr_employer object contbr_occupation object contb_receipt_amt float64 contb_receipt_dt object receipt_desc object memo_cd object memo_text object form_tp object file_num int32 tran_id object election_tp object dtypes: float64(1), int32(1), object(16) memory usage: 993.4+ MB
doner_df.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | tran_id | election_tp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00458844 | P60006723 | Rubio, Marco | BLUM, MAUREEN | WASHINGTON | 20 | DC | STRATEGIC COALITIONS & INITIATIVES LL | OUTREACH DIRECTOR | 175.0 | 15-MAR-16 | NaN | NaN | NaN | SA17A | 1082559 | SA17.1152124 | P2016 |
1 | C00458844 | P60006723 | Rubio, Marco | DODSON, MARK B. MR. | ATLANTA | 30 | GA | MORTGAGE CAPITAL ADVISORS | PRIVATE MORTGAGE BANKING | 25.0 | 16-MAR-16 | NaN | NaN | DEBT RETIREMENT | SA17A | 1082559 | SA17.1153048 | P2016 |
2 | C00458844 | P60006723 | Rubio, Marco | CHILDERS, WILLIAM | DPO | AE | 098309998 | DIPLOMAT | US GOVERNMENT | 100.0 | 20-FEB-16 | NaN | NaN | NaN | SA17A | 1056862 | SA17.1020839 | P2016 |
3 | C00458844 | P60006723 | Rubio, Marco | RUCINSKI, ROBERT | APO | AE | 090960009 | US ARMY | PHYSICIAN | 200.0 | 10-MAR-16 | NaN | NaN | NaN | SA17A | 1082559 | SA17.1078677 | P2016 |
4 | C00458844 | P60006723 | Rubio, Marco | RUCINSKI, ROBERT | APO | AE | 090960009 | US ARMY | PHYSICIAN | 100.0 | 08-MAR-16 | NaN | NaN | NaN | SA17A | 1082559 | SA17.1074981 | P2016 |
doner_df['contb_receipt_amt'].value_counts()
25.00 1051165 50.00 885354 100.00 785577 10.00 637262 5.00 434221 15.00 327665 27.00 313461 250.00 278865 35.00 150087 20.00 142855 200.00 133909 40.00 125064 19.00 120481 500.00 118679 80.00 103486 8.00 102787 75.00 95509 2700.00 90866 28.00 85847 1000.00 83443 3.00 76709 38.00 69111 1.00 57444 30.00 55169 150.00 50352 37.00 39914 400.00 34955 16.00 27659 7.00 25740 300.00 23758 ... 227.93 1 -1564.34 1 -8.70 1 102.26 1 102.24 1 9.66 1 420.48 1 183.46 1 225.57 1 1730.00 1 960.56 1 225.82 1 -975.44 1 -5700.00 1 224.93 1 -133.55 1 -79.05 1 33.46 1 1732.00 1 -530.89 1 -742.87 1 225.43 1 91.46 1 426.77 1 97.22 1 91.21 1 1735.00 1 421.98 1 21.08 1 -106.13 1 Name: contb_receipt_amt, dtype: int64
# removing negitives from our main dataframe
doner_df = doner_df[doner_df.contb_receipt_amt > 0]
# Get a list of all canidates
canidates = doner_df.cand_nm.unique()
canidates
array(['Rubio, Marco', 'Santorum, Richard J.', 'Perry, James R. (Rick)', 'Carson, Benjamin S.', "Cruz, Rafael Edward 'Ted'", 'Paul, Rand', 'Clinton, Hillary Rodham', 'Sanders, Bernard', 'Fiorina, Carly', 'Huckabee, Mike', 'Pataki, George E.', "O'Malley, Martin Joseph", 'Graham, Lindsey O.', 'Bush, Jeb', 'Trump, Donald J.', 'Jindal, Bobby', 'Christie, Christopher J.', 'Walker, Scott', 'Stein, Jill', 'Webb, James Henry Jr.', 'Kasich, John R.', 'Gilmore, James S III', 'Lessig, Lawrence', 'Johnson, Gary', 'McMullin, Evan'], dtype=object)
# dictionary of party affiliation
party_map = {'Rubio, Marco': 'Republican',
'Santorum, Richard J.': 'Republican',
'Perry, James R. (Rick)': 'Republican',
'Carson, Benjamin S.': 'Republican',
"Cruz, Rafael Edward 'Ted'": 'Republican',
'Paul, Rand': 'Republican',
'Clinton, Hillary Rodham': 'Democrat',
'Sanders, Bernard': 'Democrat',
'Fiorina, Carly': 'Republican',
'Huckabee, Mike': 'Republican',
'Pataki, George E.': 'Republican',
"O'Malley, Martin Joseph": 'Democrat',
'Graham, Lindsey O.': 'Republican',
'Bush, Jeb': 'Republican',
'Trump, Donald J.': 'Republican',
'Jindal, Bobby': 'Republican',
'Christie, Christopher J.': 'Republican',
'Walker, Scott': 'Republican',
'Stein, Jill': '3rd Party',
'Webb, James Henry Jr.': 'Democrat',
'Kasich, John R.': 'Republican',
'Gilmore, James S III': 'Republican',
'Lessig, Lawrence': 'Democrat',
'Johnson, Gary': '3rd Party',
'McMullin, Evan': '3rd Party'}
# creating a party column and mapping party to canidate
doner_df['Party'] = doner_df.cand_nm.map(party_map)
doner_df.head(1)
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | tran_id | election_tp | Party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00458844 | P60006723 | Rubio, Marco | BLUM, MAUREEN | WASHINGTON | 20 | DC | STRATEGIC COALITIONS & INITIATIVES LL | OUTREACH DIRECTOR | 175.0 | 15-MAR-16 | NaN | NaN | NaN | SA17A | 1082559 | SA17.1152124 | P2016 | Republican |
# A count of how many donations were to each campaign
doner_df.groupby("cand_nm")["contb_receipt_amt"].count().sort_values()
cand_nm Gilmore, James S III 84 Pataki, George E. 343 Jindal, Bobby 761 Webb, James Henry Jr. 800 Perry, James R. (Rick) 875 Lessig, Lawrence 1337 Santorum, Richard J. 1675 McMullin, Evan 2569 Graham, Lindsey O. 3926 O'Malley, Martin Joseph 5250 Christie, Christopher J. 5838 Huckabee, Mike 6360 Walker, Scott 6656 Stein, Jill 11014 Johnson, Gary 13400 Kasich, John R. 25212 Fiorina, Carly 27536 Bush, Jeb 28038 Paul, Rand 32196 Rubio, Marco 99259 Carson, Benjamin S. 244904 Cruz, Rafael Edward 'Ted' 541088 Trump, Donald J. 762871 Sanders, Bernard 2047595 Clinton, Hillary Rodham 3471128 Name: contb_receipt_amt, dtype: int64
top_doner = doner_df[['contbr_nm','contb_receipt_amt']].copy()
top_doner = top_doner.sort_values(by='contb_receipt_amt',ascending=False)
top_doner.head(10)
contbr_nm | contb_receipt_amt | |
---|---|---|
3239378 | HILLARY VICTORY FUND - UNITEMIZED | 12777705.58 |
3274161 | HILLARY VICTORY FUND - UNITEMIZED | 7402361.45 |
3224394 | HILLARY VICTORY FUND - UNITEMIZED | 6358481.89 |
3284270 | HILLARY VICTORY FUND - UNITEMIZED | 4904860.51 |
3379521 | HILLARY VICTORY FUND - UNITEMIZED | 4575438.56 |
3447482 | HILLARY VICTORY FUND - UNITEMIZED | 4560967.14 |
3484760 | HILLARY VICTORY FUND - UNITEMIZED | 4126693.23 |
3364742 | HILLARY VICTORY FUND - UNITEMIZED | 3686373.26 |
3369437 | HILLARY VICTORY FUND - UNITEMIZED | 3600489.08 |
3384894 | HILLARY VICTORY FUND - UNITEMIZED | 2976430.32 |
com_don = top_doner[top_doner < 250]
com_don.hist(bins=100)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000007EB6BF28>]], dtype=object)
# The Total the canidate raised
cand_amount = doner_df.groupby('cand_nm')['contb_receipt_amt'].sum().sort_values()
i = 0
for don in cand_amount:
print '%s raised %.0f' %(cand_amount.index[i],don)
i += 1
Gilmore, James S III raised 105561 Webb, James Henry Jr. raised 439246 Pataki, George E. raised 507349 McMullin, Evan raised 553408 Lessig, Lawrence raised 624220 Santorum, Richard J. raised 1139075 Perry, James R. (Rick) raised 1189868 Jindal, Bobby raised 1275393 Stein, Jill raised 2493803 Huckabee, Mike raised 2499978 Johnson, Gary raised 3584074 O'Malley, Martin Joseph raised 4064948 Graham, Lindsey O. raised 4632862 Paul, Rand raised 6314863 Walker, Scott raised 6601235 Fiorina, Carly raised 6693845 Christie, Christopher J. raised 8214280 Kasich, John R. raised 14752678 Carson, Benjamin S. raised 29014000 Bush, Jeb raised 33773733 Rubio, Marco raised 40542078 Cruz, Rafael Edward 'Ted' raised 69752747 Sanders, Bernard raised 97572014 Trump, Donald J. raised 127578631 Clinton, Hillary Rodham raised 521951750
cand_amount.plot(kind='bar',figsize=(14,6))
<matplotlib.axes._subplots.AxesSubplot at 0x15902eb8>
# Average
avg_donations = doner_df.groupby("cand_nm").mean().sort_values(by="contb_receipt_amt")
avg_donations["contb_receipt_amt"].plot(kind="bar",figsize=(12,5))
<matplotlib.axes._subplots.AxesSubplot at 0x23b626a0>
doner_df.groupby('Party')['contb_receipt_amt'].sum().sort_values().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a4f08d0>
# Pivot table much like in excel
job_pt = doner_df.pivot_table('contb_receipt_amt', index='contbr_occupation', columns = 'Party', aggfunc='sum')
job_pt["Total"] = job_pt.sum(axis=1)
job_pt.head()
Party | 3rd Party | Democrat | Republican | Total |
---|---|---|---|---|
contbr_occupation | ||||
ADMINISTRATIVE ASSISTANT | NaN | 150.00 | NaN | 150.00 |
ARTIST | NaN | 100.00 | NaN | 100.00 |
ATTORNEY | NaN | 2100.50 | NaN | 2100.50 |
CAREER ADVISOR | NaN | 50.00 | NaN | 50.00 |
CERTIFIED REGISTERED NURSE ANESTHETIS | NaN | 2927.72 | NaN | 2927.72 |
job_pt = job_pt.sort_values(by="Total",ascending=True)
job_pt.info()
<class 'pandas.core.frame.DataFrame'> Index: 127268 entries, SERVICE SALES REPRESENTATIVE to RETIRED Data columns (total 4 columns): 3rd Party 3290 non-null float64 Democrat 79117 non-null float64 Republican 61340 non-null float64 Total 127268 non-null float64 dtypes: float64(4) memory usage: 4.9+ MB
job_pt = job_pt[job_pt['Total'] > 2000000]
# 44 jobs are over $2,000,000
job_pt.shape
(44, 4)
job_pt
Party | 3rd Party | Democrat | Republican | Total |
---|---|---|---|---|
contbr_occupation | ||||
INVESTMENTS | 18884.75 | 6.567254e+05 | 1.353727e+06 | 2.029337e+06 |
EDUCATOR | 9771.80 | 1.883753e+06 | 1.672757e+05 | 2.060800e+06 |
ENTREPRENEUR | 17053.00 | 1.285851e+06 | 8.152827e+05 | 2.118187e+06 |
BANKER | 6304.70 | 1.098698e+06 | 1.014728e+06 | 2.119731e+06 |
ARCHITECT | 12008.49 | 1.735440e+06 | 4.588543e+05 | 2.206302e+06 |
FARMER | 10176.55 | 5.318559e+05 | 1.692710e+06 | 2.234742e+06 |
PSYCHOLOGIST | 12250.00 | 2.117766e+06 | 1.345607e+05 | 2.264577e+06 |
CPA | 21170.95 | 1.128238e+06 | 1.308780e+06 | 2.458188e+06 |
MARKETING | 13616.45 | 1.986465e+06 | 5.504764e+05 | 2.550558e+06 |
ACCOUNTANT | 25204.50 | 1.545158e+06 | 1.054324e+06 | 2.624687e+06 |
REALTOR | 9785.00 | 1.679468e+06 | 1.132490e+06 | 2.821743e+06 |
SELF EMPLOYED | 23545.75 | 1.340535e+06 | 1.497272e+06 | 2.861353e+06 |
DIRECTOR | 19953.60 | 2.282283e+06 | 6.664497e+05 | 2.968686e+06 |
CONDUIT TOTAL LISTED IN AGG. FIELD | NaN | 3.096980e+06 | NaN | 3.096980e+06 |
VICE PRESIDENT | 6029.45 | 2.034357e+06 | 1.058275e+06 | 3.098661e+06 |
FINANCE | 41934.68 | 1.922821e+06 | 1.302961e+06 | 3.267716e+06 |
CHAIRMAN | 13738.40 | 1.093243e+06 | 2.232494e+06 | 3.339475e+06 |
ARTIST | 30383.00 | 3.060754e+06 | 2.698506e+05 | 3.360987e+06 |
PARTNER | 3445.00 | 2.251080e+06 | 1.190405e+06 | 3.444931e+06 |
SELF-EMPLOYED | 24622.00 | 3.628544e+05 | 3.671933e+06 | 4.059410e+06 |
SOFTWARE ENGINEER | 155812.17 | 3.941078e+06 | 4.105275e+05 | 4.507417e+06 |
STUDENT | 42628.07 | 3.709696e+06 | 9.290034e+05 | 4.681327e+06 |
WRITER | 58619.96 | 4.696448e+06 | 3.928428e+05 | 5.147911e+06 |
SALES | 45785.38 | 2.540823e+06 | 3.243667e+06 | 5.830276e+06 |
BUSINESS OWNER | 42160.66 | 1.870745e+06 | 4.057967e+06 | 5.970873e+06 |
INVESTOR | 69264.00 | 2.542662e+06 | 3.401620e+06 | 6.013546e+06 |
MANAGER | 82777.36 | 4.113470e+06 | 2.902017e+06 | 7.098265e+06 |
REAL ESTATE | 41567.55 | 3.084272e+06 | 4.097864e+06 | 7.223703e+06 |
TEACHER | 70832.40 | 6.771090e+06 | 9.592108e+05 | 7.801134e+06 |
ENGINEER | 151310.23 | 4.909299e+06 | 3.634184e+06 | 8.694793e+06 |
PROFESSOR | 64964.25 | 9.683826e+06 | 5.585040e+05 | 1.030729e+07 |
OWNER | 47775.71 | 4.355606e+06 | 6.240037e+06 | 1.064342e+07 |
EXECUTIVE | 62106.34 | 4.995736e+06 | 6.227872e+06 | 1.128571e+07 |
PRESIDENT | 53596.45 | 6.048134e+06 | 7.937873e+06 | 1.403960e+07 |
LAWYER | 48255.73 | 1.249978e+07 | 2.281451e+06 | 1.482948e+07 |
INFORMATION REQUESTED PER BEST EFFORTS | NaN | NaN | 1.529517e+07 | 1.529517e+07 |
CONSULTANT | 103017.70 | 1.240737e+07 | 3.875750e+06 | 1.638614e+07 |
CEO | 75970.60 | 8.035269e+06 | 8.839239e+06 | 1.695048e+07 |
PHYSICIAN | 151211.88 | 1.343999e+07 | 6.999147e+06 | 2.059035e+07 |
HOMEMAKER | 117034.51 | 1.237521e+07 | 1.659512e+07 | 2.908736e+07 |
NOT EMPLOYED | 1315.00 | 3.116304e+07 | 2.171659e+05 | 3.138152e+07 |
INFORMATION REQUESTED | 9527.50 | 1.980982e+07 | 1.990326e+07 | 3.972261e+07 |
ATTORNEY | 166154.85 | 4.129244e+07 | 1.022316e+07 | 5.168176e+07 |
RETIRED | 835991.40 | 7.735237e+07 | 8.732449e+07 | 1.655129e+08 |
# remove these rows
job_pt.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED', 'RETIRED',
'CONDUIT TOTAL LISTED IN AGG. FIELD'],axis=0,inplace=True)
#combine like rows
job_pt.loc['SELF-EMPLOYED'] = job_pt.loc['SELF-EMPLOYED'] + job_pt.loc['SELF EMPLOYED']
#remove self employed without dash
job_pt.drop('SELF EMPLOYED',inplace=True)
job_pt[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
<matplotlib.axes._subplots.AxesSubplot at 0x2caa39b0>
# Pivot table much like in excel
job_pt = doner_df.pivot_table('contb_receipt_amt', index='contbr_occupation', columns = 'Party', aggfunc='count')
job_pt["Total"] = job_pt.sum(axis=1)
job_pt.head()
Party | 3rd Party | Democrat | Republican | Total |
---|---|---|---|---|
contbr_occupation | ||||
ADMINISTRATIVE ASSISTANT | NaN | 3.0 | NaN | 3.0 |
ARTIST | NaN | 1.0 | NaN | 1.0 |
ATTORNEY | NaN | 17.0 | NaN | 17.0 |
CAREER ADVISOR | NaN | 1.0 | NaN | 1.0 |
CERTIFIED REGISTERED NURSE ANESTHETIS | NaN | 44.0 | NaN | 44.0 |
job_pt = job_pt.sort_values(by="Total",ascending=True)
job_pt = job_pt[job_pt['Total'] > 15000]
job_pt.shape
(46, 4)
job_pt.drop(['DISABLED', 'INFORMATION REQUESTED PER BEST EFFORTS', 'RETIRED', 'INFORMATION REQUESTED',
'CONDUIT TOTAL LISTED IN AGG. FIELD'],axis=0,inplace=True)
job_pt.loc['SELF-EMPLOYED'] = job_pt.loc['SELF-EMPLOYED'] + job_pt.loc['SELF EMPLOYED']
job_pt.loc['REGISTERED NURSE'] = job_pt.loc['REGISTERED NURSE'] + job_pt.loc['RN']
job_pt.loc['BUSINESS OWNER'] = job_pt.loc['BUSINESS OWNER'] + job_pt.loc['OWNER']
job_pt.drop('SELF EMPLOYED', inplace=True)
job_pt.drop('RN', inplace=True)
job_pt.drop('OWNER', inplace=True)
job_pt[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
<matplotlib.axes._subplots.AxesSubplot at 0x34b38be0>
employer_df = doner_df.pivot_table('contb_receipt_amt',index='contbr_employer',columns = 'Party',aggfunc='sum')
employer_df.shape
(398017, 3)
# once again graphing 398,017 different companies will not only look horrible, but take forever
employer_df = employer_df[employer_df.sum(1) > 350000]
employer_df.shape
(44, 3)
# Thats better
employer_df
Party | 3rd Party | Democrat | Republican |
---|---|---|---|
contbr_employer | |||
APPLE INC. | 2390.27 | 6.157492e+05 | 1.691403e+04 |
AT&T | 1599.00 | 3.930897e+05 | 8.445015e+04 |
BANK OF AMERICA | 989.00 | 3.137584e+05 | 8.756927e+04 |
CIVIC VOLUNTEER | NaN | NaN | 4.270666e+05 |
COLUMBIA UNIVERSITY | 540.00 | 5.088942e+05 | 1.585360e+04 |
COVINGTON & BURLING LLP | 1000.00 | 3.371457e+05 | 2.469000e+04 |
DLA PIPER LLP | NaN | 3.859019e+05 | 5.050000e+03 |
4100.00 | 4.880039e+05 | 2.884904e+04 | |
GOLDMAN SACHS | NaN | 2.737989e+05 | 4.441873e+05 |
33963.00 | 1.747589e+06 | 4.086180e+04 | |
HARVARD UNIVERSITY | 779.00 | 6.090910e+05 | 5.147250e+03 |
HOMEMAKER | 70712.76 | 1.032269e+05 | 1.557000e+07 |
IBM | 13394.80 | 4.785958e+05 | 9.277686e+04 |
INFORMATION REQUESTED | 9527.50 | 1.915960e+07 | 2.015585e+07 |
INFORMATION REQUESTED PER BEST EFFORTS | NaN | NaN | 1.631469e+07 |
JPMORGAN CHASE | NaN | 3.761825e+05 | 2.130020e+04 |
KAISER PERMANENTE | 5051.00 | 4.486548e+05 | 5.095701e+04 |
KIRKLAND & ELLIS LLP | NaN | 3.053693e+05 | 7.715636e+04 |
LATHAM & WATKINS LLP | NaN | 3.358406e+05 | 1.774300e+04 |
MICROSOFT | 8700.00 | 8.683209e+05 | 8.708010e+04 |
MORGAN & MORGAN | NaN | 5.768052e+05 | 1.900000e+03 |
MORGAN STANLEY | 3157.00 | 4.349717e+05 | 1.980259e+05 |
NEW YORK UNIVERSITY | 3581.50 | 4.238838e+05 | 7.488000e+03 |
NONE | 87436.75 | 1.657044e+07 | 2.140423e+06 |
NOT EMPLOYED | 2275.00 | 1.817125e+07 | 2.450286e+05 |
PAUL WEISS RIFKIND WHARTON & GARRISON | NaN | 3.452812e+05 | 6.200000e+03 |
RETIRED | 853404.11 | 1.778874e+07 | 8.684480e+07 |
SELF | 516475.77 | 6.375224e+06 | 4.304496e+06 |
SELF EMPLOYED | 316031.33 | 7.194902e+06 | 9.967645e+06 |
SELF-EMPLOYED | 288872.18 | 6.057527e+07 | 3.011407e+07 |
SIDLEY AUSTIN LLP | NaN | 3.337994e+05 | 5.223000e+04 |
SKADDEN ARPS SLATE MEAGHER & FLOM LLP | NaN | 3.655658e+05 | 4.201300e+02 |
STANFORD UNIVERSITY | 3890.00 | 6.700374e+05 | 2.810121e+04 |
STATE OF CALIFORNIA | 900.00 | 3.908809e+05 | 5.579683e+04 |
STUDENT | 14591.70 | 1.132440e+05 | 6.591907e+05 |
TWICE REQUESTED NOT YET RECEIVED | 372793.06 | NaN | NaN |
U.S. DEPARTMENT OF JUSTICE | NaN | 3.577409e+05 | 1.030258e+04 |
U.S. DEPARTMENT OF STATE | 1275.00 | 5.311461e+05 | 4.081750e+03 |
U.S. GOVERNMENT | NaN | 6.733251e+05 | 3.207180e+04 |
UCLA | 1000.00 | 3.321899e+05 | 2.331297e+04 |
UNIVERSITY OF CALIFORNIA | 1550.00 | 3.927622e+05 | 1.969000e+03 |
UNIVERSITY OF MICHIGAN | 3395.00 | 3.789262e+05 | 1.661126e+04 |
WELLS FARGO | 9045.00 | 3.662068e+05 | 1.049981e+05 |
YALE UNIVERSITY | 2144.75 | 3.903887e+05 | 5.457600e+03 |
employer_df.drop(['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED', 'RETIRED',
'TWICE REQUESTED NOT YET RECEIVED'],axis=0,inplace=True)
# combine like rows
employer_df.loc['SELF-EMPLOYED'] = employer_df.loc['SELF-EMPLOYED'] + employer_df.loc['SELF EMPLOYED'] + employer_df.loc['SELF']
employer_df.loc['NOT EMPLOYED'] = employer_df.loc['NOT EMPLOYED'] + employer_df.loc['NONE']
# remove self employed without dash
employer_df.drop('SELF EMPLOYED',inplace=True)
employer_df.drop('SELF',inplace=True)
employer_df.drop('NONE',inplace=True)
# These were far to large to allow us to see anything else
employer_df.drop('HOMEMAKER',inplace=True)
employer_df.drop('NOT EMPLOYED',inplace=True)
employer_df.drop('SELF-EMPLOYED',inplace=True)
employer_df["Total"] = employer_df.sum(axis=1)
employer_df = employer_df.sort_values(by="Total",ascending=True)
employer_df.head()
Party | 3rd Party | Democrat | Republican | Total |
---|---|---|---|---|
contbr_employer | ||||
PAUL WEISS RIFKIND WHARTON & GARRISON | NaN | 345281.25 | 6200.00 | 351481.25 |
LATHAM & WATKINS LLP | NaN | 335840.62 | 17743.00 | 353583.62 |
UCLA | 1000.0 | 332189.88 | 23312.97 | 356502.85 |
COVINGTON & BURLING LLP | 1000.0 | 337145.72 | 24690.00 | 362835.72 |
SKADDEN ARPS SLATE MEAGHER & FLOM LLP | NaN | 365565.76 | 420.13 | 365985.89 |
employer_df[['Democrat','Republican']].plot(kind='barh',figsize=(10,16),cmap='bwr')
<matplotlib.axes._subplots.AxesSubplot at 0x417e55f8>
state_df = doner_df.pivot_table('contb_receipt_amt',index='contbr_st',columns = 'Party',aggfunc='sum')
state_df.head()
Party | 3rd Party | Democrat | Republican |
---|---|---|---|
contbr_st | |||
20 | NaN | NaN | 175.00 |
30 | NaN | NaN | 25.00 |
AA | 250.0 | 42749.35 | 2474.07 |
AB | NaN | 4493.10 | NaN |
AE | 5987.7 | 172202.35 | 37958.55 |
state_df.drop(['20','30','FF','FR','GU','IS','LO','MP','ON','PR','SI','VI','XX','AS','AU','BC','BR','EN','JA',
'LE','SW','U*','UK','AA','BU','PU','SA','TE','ZZ','AB','BA','BE','CH','DU','E','EU','FM','GR','HE','KE','LI','MB',
'NB','NL','NO','NS','OS','QC','SH','SK','SO','ST','TO','W.','GE','N.','C','PW','AE','AM','AP','QB'],inplace=True)
state_df["Total"] = state_df.sum(axis=1)
state_df = state_df.sort_values(by="Total",ascending=True)
state_df.head()
Party | 3rd Party | Democrat | Republican | Total |
---|---|---|---|---|
contbr_st | ||||
ND | 2875.00 | 327070.57 | 571892.83 | 901838.40 |
SD | 3504.00 | 521135.46 | 708679.05 | 1233318.51 |
DE | 6217.00 | 977554.76 | 604862.65 | 1588634.41 |
AK | 22036.86 | 972811.97 | 810857.98 | 1805706.81 |
WY | 14072.50 | 572265.52 | 1224614.66 | 1810952.68 |
state_df[['Democrat','Republican']].plot(kind='barh',figsize=(15,16),cmap='bwr')
<matplotlib.axes._subplots.AxesSubplot at 0x488137b8>
RI_df = doner_df[doner_df.contbr_st == 'RI']
RI_df.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | tran_id | election_tp | Party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
83179 | C00458844 | P60006723 | Rubio, Marco | BLOUNT, F. NELSON | BARRINGTON | RI | 028061841 | BLOUNT FINE FOODS | PRESIDENT | 250.0 | 02-FEB-16 | NaN | NaN | NaN | SA17A | 1056862 | SA17.969678 | P2016 | Republican |
83180 | C00458844 | P60006723 | Rubio, Marco | COOPER, DAVID | NEWPORT | RI | 028402004 | U.S. NAVAL WAR COLLEGE | PROFESSOR | 100.0 | 21-FEB-16 | NaN | NaN | NaN | SA17A | 1056862 | SA17.1026442 | P2016 | Republican |
83181 | C00458844 | P60006723 | Rubio, Marco | DEYOE, RONALD A. CAPT. | NEWPORT | RI | 028401528 | RETIRED | RETIRED | 50.0 | 18-FEB-16 | NaN | NaN | NaN | SA17A | 1056862 | SA17.1018752 | P2016 | Republican |
83182 | C00458844 | P60006723 | Rubio, Marco | FOGG, DAVID | WICKFORD | RI | 028525103 | AVC | SEMI-RETIRED | 25.0 | 14-FEB-16 | NaN | NaN | NaN | SA17A | 1056862 | SA17.1010821 | P2016 | Republican |
83183 | C00458844 | P60006723 | Rubio, Marco | FOGG, DAVID | WICKFORD | RI | 028525103 | AVC | SEMI-RETIRED | 25.0 | 18-FEB-16 | NaN | NaN | NaN | SA17A | 1056862 | SA17.1017099 | P2016 | Republican |
RI_df.shape
(25918, 19)
RI_df.contbr_city.unique()
array(['BARRINGTON', 'NEWPORT', 'WICKFORD', 'WAKEFIELD', 'PORTSMOUTH', 'WEST WARWICK', 'NARRAGANSETT', 'EAST GREENWICH', 'PROVIDENCE', 'SAUNDERSTOWN', 'LITTLE COMPTON', 'CHARLESTOWN', 'EAST PROVIDENCE', 'WARWICK', 'MIDDLETOWN', 'TIVERTON', 'KINGSTON', 'SMITHFIELD', 'WOONSOCKET', 'WESTERLY', 'RUMFORD', 'NORTH PROVIDENCE', 'NORTH SMITHFIELD', 'FOSTER', 'COVENTRY', 'EXETER', 'LINCOLN', 'HOPE VALLEY', 'CUMBERLAND', 'CHEPACHET', 'ASHAWAY', 'PAWTUCKET', 'NORTH KINGSTOWN', 'GREENE', 'BRISTOL', 'N SCITUATE', 'GREENVILLE', 'NORTH SCITUATE', 'WYOMING', 'WCKFORD', 'WEST GREENWICH', 'WEST KINGSTON', 'CRANSTON', 'RIVERSIDE', 'N SMITHFIELD', 'WEST WARWWICK', 'E. GREENWICH', 'JAMESTOWN', 'N KINGSTOWN', 'WARREN', 'BLOCK ISLAND', 'JOHNSTON', 'HOPKINTON', 'HOPE', 'SOUTH KINGSTOWN', 'SCITUATE', 'E GREENWICH', 'CENTRAL FALLS', 'PASCOAG', 'CAROLINA', 'MANVILLE', 'RICHMOND', 'ADAMSVILLE', 'PRUDENCE ISL', 'MAPLEVILLE', 'SLATERSVILLE', 'PEACE DALE', 'WOOD RIVER JUNCTION', 'FORESTDALE', 'HARRISVILLE', 'ALBION', 'E PROVIDENCE', 'PARIS', 'N PROVIDENCE', 'S KINGSTOWN', 'HARMONY', 'ROCKVILLE', 'SLOCUM', 'SHANNOCK', 'CLAYVILLE', 'NEW SHOREHAM', 'W GREENWICH', 'INFO REQUESTED', 'BRADFORD', 'L. COMPTON', 'NARRAGANSET', 'HERMITAGE', 'NORTHPROVIDENCE', 'N. SCITUATE', 'BARRINGTION', 'SPEARFISH', 'VACAVILLE', 'MIDDLEFIELD', 'POOLESVILLE', 'MADISON', 'WEDT KINGSTON', 'SMITHFIELDSMITHFIELD', 'NORTH KING', 'NO. PROVIDENCE', 'WSET GREENWICH', 'SCOTTSDALE', 'LAKE FOREST', 'REDMOND', 'SAUNDERSTROWN', 'PITTSBURGH', 'PORT ST LUCIE', 'HONOMU', 'FTWORTH', 'CLINTON', 'N. KINGSTOWN', 'N. PROVIDENCE', 'PREONZO', 'N.PROV.', 'MIDDLETWON', 'LINCON', 'EAST GREENWICH, RI 02818', 'CHAPACHET', 'CHAMBERSBURG', 'OAKLAND', 'S. KINGSTOWN', 'SPARTANBURG', 'POTTSTOWN', 'FAIR LAWN', 'BURRILLVILLE', 'WARIWCK', 'GLENDALE', 'WARWICK. RHODE ISLAND', 'FAIR HAVEN', 'CHARLESTON', 'N.SCITUATE', 'WAWICK', 'CLAREMORE', 'SALT LAKE CITY', 'NO.KINGSTOWN', 'NARRAGONSETT', 'PERU', 'LITTLE COMPTO', 'KIAWAH ISLAND', 'ESAT PROVIDENCE'], dtype=object)
RI_df['contbr_city'].value_counts()
PROVIDENCE 5428 CRANSTON 1916 WARWICK 1548 NEWPORT 1286 NORTH KINGSTOWN 1084 BARRINGTON 1002 WAKEFIELD 917 PAWTUCKET 854 EAST GREENWICH 773 BRISTOL 737 RIVERSIDE 597 CUMBERLAND 553 WESTERLY 551 NARRAGANSETT 507 PORTSMOUTH 507 WEST WARWICK 455 JAMESTOWN 435 MIDDLETOWN 398 COVENTRY 392 WEST KINGSTON 382 NORTH PROVIDENCE 362 LINCOLN 355 WARREN 291 CHARLESTOWN 288 TIVERTON 286 BLOCK ISLAND 284 WOONSOCKET 264 LITTLE COMPTON 252 SAUNDERSTOWN 233 JOHNSTON 202 ... FTWORTH 1 S. KINGSTOWN 1 CHAPACHET 1 PORT ST LUCIE 1 N. PROVIDENCE 1 EAST GREENWICH, RI 02818 1 NARRAGONSETT 1 LINCON 1 L. COMPTON 1 NORTH KING 1 PERU 1 WARWICK. RHODE ISLAND 1 CLAREMORE 1 LAKE FOREST 1 N.PROV. 1 NO.KINGSTOWN 1 WAWICK 1 WSET GREENWICH 1 FAIR HAVEN 1 REDMOND 1 SALT LAKE CITY 1 CHARLESTON 1 SCOTTSDALE 1 BURRILLVILLE 1 CHAMBERSBURG 1 MIDDLETWON 1 WEDT KINGSTON 1 SMITHFIELDSMITHFIELD 1 BARRINGTION 1 MADISON 1 Name: contbr_city, dtype: int64
RI_df = RI_df[~RI_df['contbr_city'].isin(['CHAMBERSBURG', 'CLINTON', 'FAIR HAVEN', 'CLAREMORE', 'FTWORTH', 'HERMITAGE', 'HONOMU',
'KIAWAH ISLAND', 'LAKE FOREST', 'MADISON', 'MIDDLEFIELD', 'PARIS', 'PERU', 'PITTSBURGH', 'POOLESVILLE', 'PORT ST LUCIE',
'POTTSTOWN', 'PREONZO', 'SALT LAKE CITY', 'SCOTTSDALE', 'SPARTANBURG', 'SPEARFISH', 'VACAVILLE', 'INFO REQUESTED',
'REDMOND'])]
RI_df = RI_df.replace(['BARRINGTION'],'BARRINGTON')
RI_df = RI_df.replace(['GLENDALE','HARRISVILLE','MAPLEVILLE','OAKLAND','PASCOAG'],'BURRILLVILLE')
RI_df = RI_df.replace(['CAROLINA','CHARLESTON','CHARLESTOWN'],'CHARLESTOWN')
RI_df = RI_df.replace(['GREENE'],'COVENTRY')
RI_df = RI_df.replace(['E GREENWICH','E. GREENWICH','EAST GREENWICH, RI 02818'],'EAST GREENWICH')
RI_df = RI_df.replace(['E PROVIDENCE','EAST PROVIDENCE','ESAT PROVIDENCE','RIVERSIDE','RUMFORD'],'EAST PROVIDENCE')
RI_df = RI_df.replace(['CLAYVILLE'],'FOSTER')
RI_df = RI_df.replace(['CHAPACHET','CHEPACHET','HARMONY'],'GLOCESTER')
RI_df = RI_df.replace(['ASHAWAY','HOPE VALLEY','ROCKVILLE'],'HOPKINTON')
RI_df = RI_df.replace(['ALBION','FAIR LAWN','LINCON','MANVILLE'],'LINCOLN')
RI_df = RI_df.replace(['ADAMSVILLE','L. COMPTON','LITTLE COMPTO','MANVILLE'],'LITTLE COMPTON')
RI_df = RI_df.replace(['MIDDLETWON'],'MIDDLETOWN')
RI_df = RI_df.replace(['NARRAGANSETT','NARRAGONSETT','SAUNDERSTOWN','SAUNDERSTROWN','NARRAGANSET'],'NARRAGANSETT')
RI_df = RI_df.replace(['BLOCK ISLAND'],'NEW SHOREHAM')
RI_df = RI_df.replace(['N KINGSTOWN','N. KINGSTOWN','NO.KINGSTOWN','NORTH KING','SLOCUM','WCKFORD','WICKFORD'],'NORTH KINGSTOWN')
RI_df = RI_df.replace(['N PROVIDENCE','N. PROVIDENCE','N.PROV.','NO. PROVIDENCE','NORTHPROVIDENCE'],'NORTH PROVIDENCE')
RI_df = RI_df.replace(['FORESTDALE','N SMITHFIELD','SLATERSVILLE'],'NORTH SMITHFIELD')
RI_df = RI_df.replace(['PRUDENCE ISL'],'PORTSMOUTH')
RI_df = RI_df.replace(['SHANNOCK','WOOD RIVER JUNCTION','WYOMING'],'RICHMOND')
RI_df = RI_df.replace(['HOPE','N SCITUATE','N. SCITUATE','N.SCITUATE','NORTH SCITUATE'],'SCITUATE')
RI_df = RI_df.replace(['GREENVILLE','SMITHFIELDSMITHFIELD'],'SMITHFIELD')
RI_df = RI_df.replace(['KINGSTON','PEACE DALE','S KINGSTOWN','S. KINGSTOWN','SOUTH KINGSTOWN','WAKEFIELD','WEDT KINGSTON','WEST KINGSTON'],'SOUTH KINGSTOWN')
RI_df = RI_df.replace(['WARIWCK','WAWICK','WARWICK. RHODE ISLAND'],'WARWICK')
RI_df = RI_df.replace(['W GREENWICH','WSET GREENWICH'],'WEST GREENWICH')
RI_df = RI_df.replace(['WEST WARWWICK'],'WEST WARWICK')
RI_df = RI_df.replace(['BRADFORD'],'WESTERLY')
RI_df['contbr_city'].value_counts()
PROVIDENCE 5428 CRANSTON 1916 SOUTH KINGSTOWN 1754 WARWICK 1553 NEWPORT 1286 NORTH KINGSTOWN 1170 BARRINGTON 1003 EAST PROVIDENCE 968 PAWTUCKET 854 EAST GREENWICH 816 NARRAGANSETT 745 BRISTOL 737 WESTERLY 555 CUMBERLAND 553 PORTSMOUTH 509 LINCOLN 485 WEST WARWICK 456 JAMESTOWN 435 COVENTRY 419 MIDDLETOWN 399 NORTH PROVIDENCE 375 CHARLESTOWN 309 WARREN 291 NEW SHOREHAM 288 TIVERTON 286 SCITUATE 279 WOONSOCKET 264 LITTLE COMPTON 261 GLOCESTER 218 JOHNSTON 202 SMITHFIELD 196 NORTH SMITHFIELD 167 BURRILLVILLE 156 HOPKINTON 143 RICHMOND 132 FOSTER 125 WEST GREENWICH 73 EXETER 62 CENTRAL FALLS 20 Name: contbr_city, dtype: int64
# dictionary of RI Counties
county_map = {'BARRINGTON': 'BRISTOL',
'BRISTOL': 'BRISTOL',
'BURRILLVILLE': 'PROVIDENCE',
'CENTRAL FALLS': 'PROVIDENCE',
'CHARLESTOWN': 'WASHINGTON',
'COVENTRY': 'KENT',
'CRANSTON': 'PROVIDENCE',
'CUMBERLAND': 'PROVIDENCE',
'EAST GREENWICH': 'KENT',
'EAST PROVIDENCE': 'PROVIDENCE',
'EXETER': 'WASHINGTON',
'FOSTER': 'PROVIDENCE',
'GLOCESTER': 'PROVIDENCE',
'HOPKINTON': 'WASHINGTON',
'JAMESTOWN': 'NEWPORT',
'JOHNSTON': 'PROVIDENCE',
'LINCOLN': 'PROVIDENCE',
'LITTLE COMPTON': 'NEWPORT',
'MIDDLETOWN': 'NEWPORT',
'NARRAGANSETT': 'WASHINGTON',
'NEWPORT': 'NEWPORT',
'NEW SHOREHAM': 'WASHINGTON',
'NORTH KINGSTOWN': 'WASHINGTON',
'NORTH PROVIDENCE': 'PROVIDENCE',
'NORTH SMITHFIELD': 'PROVIDENCE',
'PAWTUCKET': 'PROVIDENCE',
'PORTSMOUTH': 'NEWPORT',
'PROVIDENCE': 'PROVIDENCE',
'RICHMOND': 'WASHINGTON',
'SCITUATE': 'PROVIDENCE',
'SMITHFIELD': 'PROVIDENCE',
'SOUTH KINGSTOWN': 'WASHINGTON',
'TIVERTON': 'NEWPORT',
'WARREN': 'BRISTOL',
'WARWICK': 'KENT',
'WESTERLY': 'WASHINGTON',
'WEST GREENWICH': 'KENT',
'WEST WARWICK': 'KENT',
'WOONSOCKET': 'PROVIDENCE'}
# creating a party column and mapping party to canidate
RI_df['County'] = RI_df.contbr_city.map(county_map)
RI_df['County'].value_counts()
PROVIDENCE 12206 WASHINGTON 5158 KENT 3317 NEWPORT 3176 BRISTOL 2031 Name: County, dtype: int64
RI_df.to_csv('RI_Pres_Election_2016.csv')