Pandas as a Database

Rather than installing postrgres and Django and configuring all that You can just use a Pandas collection of DataFrame tables as your database Lets see if we can find a "primary key" that we can use to connect a couple of these tables

In [93]:
import pandas as pd
pacs_scraped = pd.DataFrame.from_csv('public.raw_committees_scraped.csv')  # id
pacs = pd.DataFrame.from_csv('public.raw_committees.csv')  # no ID that I can find
candidates = pd.DataFrame.from_csv('public.raw_candidate_filings.csv')  # id_nmbr
print(pacs_scraped.info())
print(candidates.info())
<class 'pandas.core.frame.DataFrame'>
Index: 1415 entries, The Good Government Council to Citizens Against Lawsuit Abuse PAC
Data columns (total 24 columns):
id                                     1415 non-null int64
acronym                                171 non-null object
pac_type                               415 non-null object
filing_effective_from                  1415 non-null object
filing_type                            1415 non-null object
address                                1415 non-null object
campaign_phone                         1238 non-null object
treasurer_name                         1415 non-null object
treasurer_mailing_address              1415 non-null object
treasurer_work_phone_home_phone_fax    1369 non-null object
treasurer_email_address                1412 non-null object
candidate_name                         766 non-null object
candidate_election_office              766 non-null object
candidate_party_affiliation            761 non-null object
candidate_candidate_address            766 non-null object
candidate_work_phone_home_phone_fax    725 non-null object
candidate_mailing_address              766 non-null object
candidate_email_address                766 non-null object
candidate_occupation                   766 non-null object
candidate_employer                     650 non-null object
measure_election                       1 non-null object
measure_support                        1 non-null object
measure_details                        1 non-null object
committee_type                         1415 non-null object
dtypes: int64(1), object(23)
memory usage: 276.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 2353 entries, 2014 General Election to 2008 Primary Election
Data columns (total 51 columns):
election_year                                              2353 non-null int64
office_group                                               2351 non-null object
id_nbr                                                     2351 non-null float64
office                                                     2351 non-null object
candidate_office                                           2351 non-null object
candidate_file_rsn                                         2353 non-null int64
file_mthd_ind                                              2353 non-null object
filetype_descr                                             2353 non-null object
party_descr                                                2351 non-null object
major_party_ind                                            2351 non-null object
cand_ballot_name_txt                                       2353 non-null object
occptn_txt                                                 2342 non-null object
education_bckgrnd_txt                                      510 non-null object
occptn_bkgrnd_txt                                          2323 non-null object
school_grade_diploma_degree_certificate_course_of_study    2180 non-null object
prev_govt_bkgrnd_txt                                       2306 non-null object
judge_incbnt_ind                                           2350 non-null object
qlf_ind                                                    2348 non-null object
filed_date                                                 2353 non-null object
file_fee_rfnd_date                                         0 non-null float64
witdrw_date                                                0 non-null float64
withdrw_resn_txt                                           0 non-null float64
pttn_file_date                                             13 non-null object
pttn_sgnr_rqd_nbr                                          120 non-null float64
pttn_signr_filed_nbr                                       118 non-null float64
pttn_cmplt_date                                            11 non-null object
ballot_order_nbr                                           2279 non-null float64
prfx_name_cd                                               1187 non-null object
first_name                                                 2353 non-null object
mdle_name                                                  1294 non-null object
last_name                                                  2353 non-null object
sufx_name                                                  33 non-null object
title_txt                                                  41 non-null object
mailing_addr_line_1                                        2353 non-null object
mailing_addr_line_2                                        58 non-null object
mailing_city_name                                          2317 non-null object
mailing_st_cd                                              2317 non-null object
mailing_zip_code                                           2317 non-null float64
mailing_zip_plus_four                                      157 non-null float64
residence_addr_line_1                                      2353 non-null object
residence_addr_line_2                                      14 non-null object
residence_city_name                                        2089 non-null object
residence_st_cd                                            2089 non-null object
residence_zip_code                                         2089 non-null float64
residence_zip_plus_four                                    101 non-null float64
home_phone                                                 1673 non-null object
cell_phone                                                 358 non-null object
fax_phone                                                  706 non-null object
email                                                      2247 non-null object
work_phone                                                 1783 non-null object
web_address                                                1114 non-null object
dtypes: float64(11), int64(2), object(38)
memory usage: 955.9+ KB
None

Primary Keys?

Find PK and foreign key fields to be able to join across tables

In [83]:
import re
from itertools import product
regex = re.compile(r'(\b|_|^)[Ii][Dd](\b|_|$)')
pac_id_cols = [col for col in pacs.columns if regex.search(col)]
print(pac_id_cols)
pac_scraped_id_cols = [col for col in pacs_scraped.columns if regex.search(col)]
print(pac_scraped_id_cols)
candidate_id_cols = [col for col in candidates.columns if regex.search(col)]
print(candidate_id_cols)
trans = pd.DataFrame.from_csv('public.raw_committee_transactions_ammended_transactions.csv')
trans_id_cols = [col for col in trans.columns if regex.search(col)]
print(trans_id_cols)
tables = [('pac', pacs, pac_id_cols), ('pac_scraped', pacs_scraped, pac_scraped_id_cols), ('candidate', candidates, candidate_id_cols), ('trans', trans, trans_id_cols)]
graph = []
for ((n1, df1, cols1), (n2, df2, cols2)) in product(tables, tables):
    if n1 == n2:
        continue
    for col1 in cols1:
        for col2 in cols2:
            s1 = set(df1[col1].unique())
            s2 = set(df2[col2].unique())
            similarity = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
            print('{}.{} -- {:.3} -- {}.{}'.format(n1, col1, similarity, n2, col2 ))
            graph += [(n1, col1, similarity, n2, col2)]
graph = pd.DataFrame(sorted(graph, key=lambda x:x[2]), columns=['table1', 'column1', 'similarity', 'table2', 'column2'])
print(graph)
[]
['id']
['id_nbr']
['original_id', 'contributor_payee_committee_id', 'filer_id']
pac_scraped.id -- 0.00114 -- candidate.id_nbr
pac_scraped.id -- 0.0 -- trans.original_id
pac_scraped.id -- 0.0283 -- trans.contributor_payee_committee_id
pac_scraped.id -- 0.169 -- trans.filer_id
candidate.id_nbr -- 0.00114 -- pac_scraped.id
candidate.id_nbr -- 0.0 -- trans.original_id
candidate.id_nbr -- 0.00239 -- trans.contributor_payee_committee_id
candidate.id_nbr -- 0.00908 -- trans.filer_id
trans.original_id -- 0.0 -- pac_scraped.id
trans.contributor_payee_committee_id -- 0.0283 -- pac_scraped.id
trans.filer_id -- 0.169 -- pac_scraped.id
trans.original_id -- 0.0 -- candidate.id_nbr
trans.contributor_payee_committee_id -- 0.00239 -- candidate.id_nbr
trans.filer_id -- 0.00908 -- candidate.id_nbr
         table1                         column1  similarity       table2  \
0   pac_scraped                              id    0.000000        trans   
1     candidate                          id_nbr    0.000000        trans   
2         trans                     original_id    0.000000  pac_scraped   
3         trans                     original_id    0.000000    candidate   
4   pac_scraped                              id    0.001139    candidate   
5     candidate                          id_nbr    0.001139  pac_scraped   
6     candidate                          id_nbr    0.002392        trans   
7         trans  contributor_payee_committee_id    0.002392    candidate   
8     candidate                          id_nbr    0.009079        trans   
9         trans                        filer_id    0.009079    candidate   
10  pac_scraped                              id    0.028276        trans   
11        trans  contributor_payee_committee_id    0.028276  pac_scraped   
12  pac_scraped                              id    0.169406        trans   
13        trans                        filer_id    0.169406  pac_scraped   

                           column2  
0                      original_id  
1                      original_id  
2                               id  
3                           id_nbr  
4                           id_nbr  
5                               id  
6   contributor_payee_committee_id  
7                           id_nbr  
8                         filer_id  
9                           id_nbr  
10  contributor_payee_committee_id  
11                              id  
12                        filer_id  
13                              id  
In [86]:
print(pacs_scraped.index.dtype)
print(pacs.index.dtype)
object
int64
In [87]:
trans = pd.DataFrame.from_csv('public.raw_committee_transactions_ammended_transactions.csv')
trans.describe()
Out[87]:
original_id amount aggregate_amount contributor_payee_committee_id filer_id intrst_rate check_nbr addr_book_agent_name zip zip_plus_four exp_date
count 2997.000000 2997.000000 2997.000000 160.000000 2997.000000 9 5.080000e+02 0 2355.000000 150.000000 0
mean 1885216.702369 1530.411655 4623.306210 7128.418750 9703.807808 0 5.985057e+06 NaN 90213.352017 5940.713333 NaN
std 211274.747411 10621.464988 30598.010565 6719.423466 6866.094987 0 1.331107e+08 NaN 21051.288729 2989.624421 NaN
min 71433.000000 -597.630000 0.000000 3.000000 10.000000 0 0.000000e+00 NaN 2117.000000 1.000000 NaN
25% 1864267.000000 35.000000 118.000000 629.000000 2480.000000 0 1.012750e+03 NaN 97038.000000 3498.000000 NaN
50% 1929490.000000 150.000000 250.000000 4749.000000 11508.000000 0 1.167500e+03 NaN 97219.000000 6628.000000 NaN
75% 1979015.000000 500.000000 977.650000 14931.000000 17001.000000 0 1.979250e+03 NaN 97401.000000 8652.000000 NaN
max 2049455.000000 452550.000000 543057.000000 17370.000000 17454.000000 0 3.000184e+09 NaN 99362.000000 9998.000000 NaN

Original_ID?

So it looks like there are multiple revisions for many of the "unique" original_id. So to consolidate those revisions into unqiue records with unique IDs (take the most recent revision as the official record):

In [88]:
filtered_trans = []
for id in trans.original_id.unique():
    rows = sorted(trans[trans.original_id == id].iterrows(), key=lambda x:x[1].attest_date, reverse=True)
    filtered_trans += [rows[0][1]]
filtered_trans = pd.DataFrame(filtered_trans)
print(len(trans) / float(len(filtered_trans)))
print(filtered_trans.describe())
1.11
          original_id         amount  aggregate_amount  \
count     2700.000000    2700.000000       2700.000000   
mean   1884926.093333    1398.428496       4339.031659   
std     222078.767780   10643.772144      31247.914447   
min      71433.000000    -597.630000          0.000000   
25%    1864223.750000      35.000000        108.290000   
50%    1942541.000000     125.000000        238.805000   
75%    1979260.500000     500.000000        767.900000   
max    2049455.000000  452550.000000     543057.000000   

       contributor_payee_committee_id      filer_id  intrst_rate  \
count                      129.000000   2700.000000            9   
mean                      7872.085271   9612.684444            0   
std                       6991.062308   6897.686139            0   
min                          3.000000     10.000000            0   
25%                        470.000000   2189.000000            0   
50%                       4831.000000  11487.000000            0   
75%                      15323.000000  17007.000000            0   
max                      17370.000000  17454.000000            0   

          check_nbr  addr_book_agent_name           zip  zip_plus_four  \
count  4.210000e+02                     0   2087.000000     144.000000   
mean   7.221467e+06                   NaN  89631.470532    6013.465278   
std    1.462182e+08                   NaN  21828.475998    3018.173828   
min    0.000000e+00                   NaN   2117.000000       1.000000   
25%    1.014000e+03                   NaN  97034.000000    3436.750000   
50%    1.177000e+03                   NaN  97218.000000    6707.000000   
75%    2.060000e+03                   NaN  97404.000000    8652.000000   
max    3.000184e+09                   NaN  99362.000000    9998.000000   

       exp_date  
count         0  
mean        NaN  
std         NaN  
min         NaN  
25%         NaN  
50%         NaN  
75%         NaN  
max         NaN  
In [89]:
df = filtered_trans
filer_sums = df.groupby('filer_id').amount.sum()
print(pacs_scraped.columns)
print(df.columns)
for (filer_id, amount) in sorted(filer_sums.iteritems(), key=lambda x:x[1], reverse=True):
    names = pacs_scraped[pacs_scraped.id == filer_id].index.values
    print('{}\t{}\t{}'.format(filer_id, names[0][:40] if len(names) else '', amount))
Index([u'id', u'acronym', u'pac_type', u'filing_effective_from',
       u'filing_type', u'address', u'campaign_phone', u'treasurer_name',
       u'treasurer_mailing_address', u'treasurer_work_phone_home_phone_fax',
       u'treasurer_email_address', u'candidate_name',
       u'candidate_election_office', u'candidate_party_affiliation',
       u'candidate_candidate_address', u'candidate_work_phone_home_phone_fax',
       u'candidate_mailing_address', u'candidate_email_address',
       u'candidate_occupation', u'candidate_employer', u'measure_election',
       u'measure_support', u'measure_details', u'committee_type'],
      dtype='object')
Index([u'original_id', u'tran_date', u'tran_status', u'filer',
       u'contributor_payee', u'sub_type', u'amount', u'aggregate_amount',
       u'contributor_payee_committee_id', u'filer_id', u'attest_by_name',
       u'attest_date', u'review_by_name', u'review_date', u'due_date',
       u'occptn_ltr_date', u'pymt_sched_txt', u'purp_desc', u'intrst_rate',
       u'check_nbr', u'tran_stsfd_ind', u'filed_by_name', u'filed_date',
       u'addr_book_agent_name', u'book_type', u'title_txt', u'occptn_txt',
       u'emp_name', u'emp_city', u'emp_state', u'employ_ind',
       u'self_employ_ind', u'addr_line1', u'addr_line2', u'city', u'state',
       u'zip', u'zip_plus_four', u'county', u'purpose_codes', u'exp_date'],
      dtype='object')
416		516102.89
17015	NO on 92 Coalition	467196.58
1524		320588.88
17007	Vote Yes on Measure 92: We have the righ	294510.58
33		161334.53
17394	MONICA PAC	149530.55
4155	Citizens to Elect Dennis Richardson	100250.0
1471		71915.1
17091	Oregon Liberty Alliance	69466.19
17192	Achieving the American Dream Coalition	66041.23
16892	Every Oregon Voter Counts	65000.0
11508		55756.03
17155	Open Primaries	50000.0
142	Oregon Education Association - People fo	37448.75
16428	Jobs Political Action Committee	36000.0
13920	Kitzhaber for Governor	35947.3
15477	Close Friends PAC	34528.56
5690		32438.47
16617	Matt Geiger for Oregon	29129.22
6845		26257.0
16603	Friends of Jodi Bailey	25315.48
13807		25150.0
14232	Friends of Chuck Thomsen	24858.41
1579		24382.0
16959	Kyle Markley for Oregon	23980.3
14044	Alan Olsen for Oregon Senate Committee	19118.29
17154	Community United For Safety	18351.11
17010	Friends of Patriot Hall	16278.3
5528		15331.08
16982	Women's Constitutional Equality PAC	14840.28
3889		14817.7
4680	Sara Gelser for State Senate	14392.22
125		13470.0
13965		13164.93
15776		13041.9
51	Credit Union Legislative Action Fund	13000.0
5785	Central Oregon Association of Realtors P	12883.0
682		12767.24
17286	Coastal Lakes Environmental Action Netwo	12755.58
16368	Campaign for the Restoration and Regulat	12525.0
13635	Committee to Elect Mike McLane	12500.0
2793		12418.41
17044	Yes on 91	12217.14
17245	Capitol Watch Political Action Committee	12000.0
14201	Friends of Bill Hansell	11147.65
16887	Andrew Petersen For Oregon	10850.0
315	Jackson County Republican Central Commit	10573.22
16171	New Approach Oregon	10568.39
4797		10500.0
4893		10450.0
2189		9271.5
17097	Boddie For Bend	9043.44
16508	Friends of Paul Evans	8988.25
15255	Friends of Elizabeth Steiner Hayward	8968.87
15334		8840.36
6724		8700.0
17435	Oregon Citizens to Recall Senator Floyd 	8387.68
4241	Elect Betty Komp	8276.95
17159	Protect Your Investment PAC	8267.5
4726		8131.6
15728	Committee to Elect Toni Webb - Josephine	8055.12
17162	Bend Good Government Committee	8048.0
15089	Charlie Hales for Mayor	8041.62
314	Jackson County Democratic Central Commit	7757.0
16809	Bridget Bailey for Wasco County Commissi	7438.95
329		7414.14
17295	Community United for Safety	7364.64
16889	YES on Oregon Safe Roads	7049.09
5446		6783.36
17001	Vote Yes on 90	6756.07
15145		6657.38
5328		6600.0
10		6500.0
11247		6377.54
306		6290.81
39	Oregon Trial Lawyers Association PAC	6162.5
17180	Friends and Neighbors for a Safe and Pro	6105.9
14509		6098.62
16541	Hayden for Oregon	6000.0
17277	Rask for School Board	5885.58
17343	Friends of Lars Hedbor	5830.9
15200	Friends of Jamie Damon	5697.66
17202	Save Columbia County Jobs	5564.26
16800	Oregon Anti-Crime Alliance PAC	5520.82
2352		5500.0
2480		5500.0
16426	Sal Peralta for Commissioner	5495.01
223		5267.9
17045	Fix Our Parks, a project of the Conserva	5232.8
4813		5170.0
17		5000.0
175	Oregon Nurseries Political Action Commit	5000.0
5667	Rogue Valley Association of Realtors Pol	5000.0
15683		5000.0
16988	Committee to Elect Leif for Douglas Coun	5000.0
338	Polk County Republican Central Committee	4751.98
16520	Friends of Tim Casey for Sheriff	4714.31
16480	Oregon Laborers Political Action Committ	4559.62
3428		4500.0
14961		4500.0
16537	Act Now for a Healthy Oregon Political C	4500.0
266		4427.63
17239	Millersburg Residents for a Responsible 	4395.31
16559	Friends of Mark Richman	4210.75
17199	Committee to Restore, Revitalize and Reo	4179.4
15497	Friends of Steve Newgard	4066.4
16610	Protect Oregon Driver Licenses Committee	4007.8
7846		4002.11
14704		3989.15
13997	Friends of Lew Frederick	3942.8
312	Hood River County Democratic Central Com	3861.54
5203		3835.0
14592		3828.38
319		3806.89
15858	John Davis For Oregon	3772.27
5079		3771.08
15343		3759.0
379		3755.0
5208		3739.48
5580		3621.34
5089	Kevin Cameron for Oregon	3526.5
4745	Oregonians for Immigration Reform Politi	3500.0
15406	Elect Ellen Rosenblum for Attorney Gener	3500.0
17031	LBCC for the Community	3500.0
17397	Bud Pierce for a Better Oregon	3500.0
15	Association of Engineering Employees of 	3483.69
16501	Friends of Robert Bruce	3450.0
16433		3281.93
26	Cable Operators Political Action Committ	3202.5
3865		3130.0
17247	Citizens for Marion County Extension	3015.57
11		3000.0
5410	Oregon Cattle PAC	3000.0
13571	Oregon Ambulatory Surgery Center Associa	3000.0
17150	Friends of Cloverdale Firefighters	3000.0
15621		2970.44
17334	Neighbors for a New Hospital	2850.0
299	Democratic Party of Coos County	2827.04
17285	Friends of Amy Kohnstamm	2765.0
7305		2600.0
17082	Beaverton Public Safety	2549.46
12786		2544.87
16196	We're for a Constitutional Government	2538.0
14377	Committee to Elect Loretta Smith	2515.0
1670		2500.0
4327		2500.0
5486		2500.0
14515		2500.0
16813	Grow PAC	2500.0
15872	Central Oregon Patriots Political Action	2455.42
325	Lane County Republican Central Committee	2382.37
113		2347.94
17077	Friends of Ray Eppley	2321.5
6104		2302.38
347	Wasco County Republican Central Committe	2263.25
15701		2249.83
3219		2240.76
618	Deschutes County Republican Central Comm	2229.69
4259		2143.21
17034	Anita Yap for PCC Board	2125.0
17057	Joe Buck For L.O.	2100.0
4572		2041.4
3604		2030.0
2307		2000.0
191	Oregon Realtors Political Action Committ	1970.03
15658		1925.29
303		1920.0
171		1898.49
16258	Friends of Diana Dickey	1890.0
101		1875.81
17067	Citizens for Improved and Safer Streets	1866.11
15519		1860.16
15348	Gomberg for State Rep	1850.0
17242	Friends of Andrew Davidson	1850.0
1686		1750.0
290	Baker County Republican Central Committe	1728.74
14049		1727.96
14984		1700.04
17158	Melton4pudboard	1690.0
4586	Oregon Federation of College Republicans	1689.5
17269	Truth In Sentencing	1680.0
11487		1668.85
16802	Oregonians for Safe Farms and Families	1656.43
16535	Elect Kerry Atherton	1648.91
17372	Friends of Trisha Claxton	1632.5
14407		1626.0
16927	Friends of James 2016	1623.2
13967		1622.2
6945		1600.0
17076	Jody Carson for City Council	1594.61
17128	Donna Lane Committee for City Council	1566.05
12493		1540.0
80		1520.0
7606		1500.0
15808	Coalition for a Healthy Oregon PAC	1500.0
15330	Friends of Jessica Vega Pederson	1494.1
17370	Youth Actvities and Academics Levy	1484.39
14608		1449.0
16812	Friends of Susan McLain	1426.8
16985	Friends of Andrew Kaza	1400.0
16434		1385.46
16538	Jodi Hack For Oregon	1383.05
17018	LaMear for Mayor	1377.23
294		1362.59
17352	Committee to Elect Lilia Caballero	1326.0
16728	Oregon GMO Right to Know	1284.0
17418	Fell for Parks	1250.42
353		1250.0
3073	Oregon Business Association PAC	1250.0
3420		1250.0
17423	Friends of Paul Carlos Southwick	1250.0
16972	Friends of Don Williams for Mayor	1232.1
16648	Scott Jackson For Sheriff	1229.26
324	Democratic Party of Lane County	1207.0
17065	OSPIRG Action	1197.65
4152		1180.0
4270		1174.25
16679	Stop the Bull Run Takeover PAC	1152.15
17119	Paul Anthony for Portland School Board	1150.0
14584		1100.0
17175	Free Our City	1100.0
14342		1064.84
335	Multnomah County Democratic Central Comm	1059.0
4792		1057.15
470		1050.0
15109	Novick for Portland	1030.0
16676	Runyan For Oregon	1021.0
16877	An Independent Voice for Oregon	1009.76
5709		1005.09
17028	Tracie Heidt for City Council	1005.0
16819	Friends of Daniel Benjamin	1000.01
3584		1000.0
4395		1000.0
5133		1000.0
10967		1000.0
15103		1000.0
16411	Stan Primozich for Yamhill County Commis	1000.0
16767	Nearman4Oregon	1000.0
348	Washington County Democratic Central Com	997.03
17121	Dennis Roler for City Council	984.07
17266	Friends of Julie Esparza Brown	976.07
169		965.98
16406		955.61
14531		950.0
17184	Price for PUD Board	920.0
17326	Friends of Christian Honl	900.0
17360	Batchelor for Board PAC	893.44
108		870.0
16936	Citizens for Neighborhood Livability	870.0
15628	Joel Ockunzzi for Jackson County Commiss	825.4
17342	Friends of Stephen Marc Beaudoin	819.7
16983	Ericksen For Oregon	813.0
14575	ChiroPAC	800.0
16757	Kathleen Taylor for Oregon	800.0
17106	Pam Ackley Campaign Account	800.0
17275	Friends of Renee for city council	795.0
4213		789.26
14183		762.16
1869		750.0
5885		750.0
16633	Committee to Elect Cathleen Callahan	750.0
16880	Lottery Local Control Committee 2016	750.0
17353	Becky Tymchuk for Beaverton School Board	750.0
17425	reelectbob	750.0
350		748.95
17407	Re-elect CTC	727.0
16624	Elect Judge Jean Marie Martwick	721.55
17328	Tina M French	709.08
16893	Laura Cooper for Rep	700.3
351	Yamhill County Republican Party	700.0
14324		700.0
5405		650.0
14242		650.0
17122	Friends of Karen Leach	646.0
5192		635.98
14277		615.0
16034	Barb for Bend	601.52
16732	Taylor For Sheriff	600.0
17257	Mike Rosen for Portland School Board	600.0
17349	Robert Keyser for Port Commission	600.0
5544		586.95
14892		570.0
5434		564.25
16938	Committee to Elect Curt Chancler for Cou	562.05
5585	Pete Sorenson Committee	550.0
16902	Clark for Mayor	550.0
15192	Buehler For a United Oregon	548.48
17261	Creswell for Affordable Water	538.09
3396		515.0
336		508.84
321	Klamath County Republican Central Commit	504.99
97	Mortgage Industry for Responsible Govern	500.0
161	Oregon Hospital Political Action Committ	500.0
228		500.0
317		500.0
1430		500.0
4846		500.0
5152		500.0
12631		500.0
12848	Douglas County Physicians PAC	500.0
13619	Oregon Aging & Disability Services Polit	500.0
13648		500.0
14223		500.0
16057	Oregon State Ass'n of Plumbing Heating C	500.0
16583	Friends of Bill Post	500.0
16680	Small Business PAC	500.0
16750	Vote Tom Brown County Commissioner	500.0
16753	Friends of Nick Caleb	500.0
16777	Elect Jackson Commissioner	500.0
16904	Friends of Rick Rose	500.0
16953	Humane Oregon PAC	500.0
17101	Lisa Seales for Bend City Council	500.0
328		488.62
291		469.99
17267	Elect Eileen Nittler	464.95
17096	Elect Tom Anderson	459.0
3477		450.0
16388	Support Local Food Rights	450.0
2109		446.84
3674		444.25
16907	Friends of Dan Chriestenson	411.5
12633		400.0
15237	Friends of Alissa Keny-Guyer	400.0
17090	Committee for Better Transit Now	400.0
17314	Support Our Sheriff	400.0
15298		399.8
16673	Friends of Kyle Allen	395.18
304	Deschutes County Democratic Central Comm	390.47
17151	Sustainable Hood River	390.0
17323	Lane Citizens for Safe Roads	388.87
2690		377.27
17335	Friends of Ivonne Gebhardt	357.42
16131	Communities of Color for a Just Oregon	320.0
3077	Oregon Independent Automobile Dealers As	315.0
6004	ArtPAC	313.48
10087		300.0
16616	Team BSW	295.35
14220		290.25
13966	Henry Heimuller for Columbia County Comm	287.74
14969		280.0
10367	The Good Government Council	276.13
16908	Taxpayer Defense Project PAC	275.0
283	Women's Investment Network PAC	270.58
16154	Portland Clean Air	266.54
302	Curry County Democratic Central Committe	257.01
16042	Scott Barbur for Milwaukie	256.51
16808	Lawrence for Judge	252.0
2803		250.0
2829	Oregon Refuse and Recycling Association 	250.0
3917		250.0
12688	Committee to Elect Seth Crawford	250.0
14391		250.0
15704		250.0
16615	Schuster for Commissioner	250.0
16738	Ken Helm for House District 34	250.0
16910	Fair and Unified Elections Committee	238.93
16381	Building Tualatin's Future	231.14
14765	Friends of John Goodhouse	223.98
13436		221.25
17347	Public Employee Choice Act Committee	218.96
293	Clackamas County Democratic Central Comm	218.76
180	Oregon Physical Therapy PAC	201.0
216		200.0
320	Klamath County Democratic Central Commit	200.0
5458		200.0
5926		200.0
12943		200.0
15741	Oregon End Violence Against Women Politi	200.0
16548	Elect Craig Wilhelm	200.0
17125	Yes for Parks 3-451	200.0
17339	Benton County Citizens Opposed to Measur	200.0
17454	Citizens for a Healthy County	200.0
2278	Committee to Elect Jeff Kruse	199.0
17022	Vote Colt	196.88
16384	Citizens for Moving Damascus Forward	196.0
16761	Friends of Stephanie Nystrom	187.5
16814	Friends of Carla Piluso	180.78
15755	Campaign for Mark Webb	180.0
17174	Committee to Elect Wendy Engler	177.79
130	Oregon Certified Public Accountants Legi	170.61
16922	ElectRayK	170.0
16373	Help End Marijuana Prohibition in Oregon	165.0
16471	Ladd Wiles for Judge	150.0
16945	Friends of Gerritt	150.0
17073	YES For Neighbors and Livability	140.36
13561		140.0
147		135.05
16709	Citizens to Elect Carl Wilson	131.37
15659	Future of Racing in Oregon Group PAC	131.15
16999	Elect Jackie Pride for Tualatin	127.0
14252	Citizens for Dave Dotterrer	125.74
7665		125.0
17166	Freedom From Pesticides Alliance	125.0
5583		121.13
931		120.33
15469	Friends of Herman Baertschiger	113.18
221		110.92
14993		107.08
5783		105.0
12916		104.0
16780	Committee to Elect Darlene V. Taylor	102.0
13774	Family Physicians of Oregon PAC	100.0
13837		100.0
15684		100.0
16989	Committee to Recall Hales	100.0
17107	Committee to elect Michael Zarosinski	100.0
17415	Friends of Colleen Busch	100.0
16528	Jodie For Deschutes	97.75
17363	Committee to Elect Janet Lightheart	97.0
3215		96.45
14922		85.94
4741	Friends of Chip Shields	80.0
16454		75.52
16969	Sonnleitner for Zone 3 PCC	75.24
16543	Friends of John Turner	68.88
16459	Friends of Rob Nosse	60.0
14490	Oregon Technology PAC	55.98
318	Josephine County Democratic Central Comm	53.74
342	Umatilla County Republican Central Commi	50.0
14523	The Mother PAC	50.0
16825	Barrett for Clatsop County Commission #5	50.0
17308	Elect Kent Norris	33.7
15523	Committee to Elect Jan Giunta	30.37
16168		30.0
292		25.0
4100	Bar Pilots of the Columbia River Politic	25.0
17264	Benton Food Freedom	20.99
17387	Supporters of Chelsea Martin	20.0
4824		18.99
145		18.26
247	SalPAC	17.5
3698		10.0
16407		8.0
622	Libertarian Party of Oregon	0.43
6106		0.33

NLP

Let's build a graph of the similarity between PACs based on the wording of their committee names

In [90]:
import matplotlib
%matplotlib inline
np = pd.np
np.norm = np.linalg.norm
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
In [101]:
df = pacs_scraped
names = df.index.values
corpus = [' '.join(str(f) for f in fields) for fields in zip(*[df[col] for col in df.columns if df[col].dtype == pd.np.dtype('O')])]
print(corpus[:3])
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), stop_words='english')
tfidf = vectorizer.fit_transform(corpus)
cov = tfidf * tfidf.T
cov[0:]
['GCC Miscellaneous 01/03/2014 to present Amendment 435 W First Ave P.O. Box 548 Albany, OR 97321 (541)926-1517 Janet Steele 435 1st Avenue W. Albany, OR 97321 (541)926-1517 (541)979-2395 (541)926-7064 [email protected] nan nan nan nan nan nan nan nan nan nan nan nan PAC', 'nan nan 11/29/2012 to 11/29/2012 Discontinuation 28356 SW Wagner Street Wilsonville, OR 97070 (503)685-7346 Carol A. Russell 89358 Cranberry Lane Bandon, OR 97411 (800)893-1005 (541)347-4423 (800)967-7380 [email protected] Matt Wingard 2012 Primary Election State Representative, 26th District Republican 28356 SW Wagner St. Wilsonville, OR 97070 (503)351-2955 28356 SW Wagner St. Wilsonville, OR 97070 [email protected] Public Relations Beambo, Inc., Wilsonville, OR nan nan nan CC', 'nan nan 04/30/2013 to present Amendment 3321 SE 20th Ave Portland, OR 97202 nan Kevin F. Neely PO Box 42307 Portland, OR 97242 (503)295-1851 [email protected] John Kroger 2012 Primary Election Attorney General Democrat 9530 SW 12th Dr. Portland, OR 97219 nan PO Box 42307 Portland, OR 97242 [email protected] Attorney General State of Oregon Department of Justice, Salem, OR nan nan nan CC']
Out[101]:
<1415x1415 sparse matrix of type '<type 'numpy.float64'>'
	with 2002225 stored elements in Compressed Sparse Row format>