Rather than installing postrgres and Django and configuring all that You can just use a Pandas collection of DataFrame tables as your database Lets see if we can find a "primary key" that we can use to connect a couple of these tables
import pandas as pd
pacs_scraped = pd.DataFrame.from_csv('public.raw_committees_scraped.csv') # id
pacs = pd.DataFrame.from_csv('public.raw_committees.csv') # no ID that I can find
candidates = pd.DataFrame.from_csv('public.raw_candidate_filings.csv') # id_nmbr
print(pacs_scraped.info())
print(candidates.info())
<class 'pandas.core.frame.DataFrame'> Index: 1415 entries, The Good Government Council to Citizens Against Lawsuit Abuse PAC Data columns (total 24 columns): id 1415 non-null int64 acronym 171 non-null object pac_type 415 non-null object filing_effective_from 1415 non-null object filing_type 1415 non-null object address 1415 non-null object campaign_phone 1238 non-null object treasurer_name 1415 non-null object treasurer_mailing_address 1415 non-null object treasurer_work_phone_home_phone_fax 1369 non-null object treasurer_email_address 1412 non-null object candidate_name 766 non-null object candidate_election_office 766 non-null object candidate_party_affiliation 761 non-null object candidate_candidate_address 766 non-null object candidate_work_phone_home_phone_fax 725 non-null object candidate_mailing_address 766 non-null object candidate_email_address 766 non-null object candidate_occupation 766 non-null object candidate_employer 650 non-null object measure_election 1 non-null object measure_support 1 non-null object measure_details 1 non-null object committee_type 1415 non-null object dtypes: int64(1), object(23) memory usage: 276.4+ KB None <class 'pandas.core.frame.DataFrame'> Index: 2353 entries, 2014 General Election to 2008 Primary Election Data columns (total 51 columns): election_year 2353 non-null int64 office_group 2351 non-null object id_nbr 2351 non-null float64 office 2351 non-null object candidate_office 2351 non-null object candidate_file_rsn 2353 non-null int64 file_mthd_ind 2353 non-null object filetype_descr 2353 non-null object party_descr 2351 non-null object major_party_ind 2351 non-null object cand_ballot_name_txt 2353 non-null object occptn_txt 2342 non-null object education_bckgrnd_txt 510 non-null object occptn_bkgrnd_txt 2323 non-null object school_grade_diploma_degree_certificate_course_of_study 2180 non-null object prev_govt_bkgrnd_txt 2306 non-null object judge_incbnt_ind 2350 non-null object qlf_ind 2348 non-null object filed_date 2353 non-null object file_fee_rfnd_date 0 non-null float64 witdrw_date 0 non-null float64 withdrw_resn_txt 0 non-null float64 pttn_file_date 13 non-null object pttn_sgnr_rqd_nbr 120 non-null float64 pttn_signr_filed_nbr 118 non-null float64 pttn_cmplt_date 11 non-null object ballot_order_nbr 2279 non-null float64 prfx_name_cd 1187 non-null object first_name 2353 non-null object mdle_name 1294 non-null object last_name 2353 non-null object sufx_name 33 non-null object title_txt 41 non-null object mailing_addr_line_1 2353 non-null object mailing_addr_line_2 58 non-null object mailing_city_name 2317 non-null object mailing_st_cd 2317 non-null object mailing_zip_code 2317 non-null float64 mailing_zip_plus_four 157 non-null float64 residence_addr_line_1 2353 non-null object residence_addr_line_2 14 non-null object residence_city_name 2089 non-null object residence_st_cd 2089 non-null object residence_zip_code 2089 non-null float64 residence_zip_plus_four 101 non-null float64 home_phone 1673 non-null object cell_phone 358 non-null object fax_phone 706 non-null object email 2247 non-null object work_phone 1783 non-null object web_address 1114 non-null object dtypes: float64(11), int64(2), object(38) memory usage: 955.9+ KB None
Find PK and foreign key fields to be able to join across tables
import re
from itertools import product
regex = re.compile(r'(\b|_|^)[Ii][Dd](\b|_|$)')
pac_id_cols = [col for col in pacs.columns if regex.search(col)]
print(pac_id_cols)
pac_scraped_id_cols = [col for col in pacs_scraped.columns if regex.search(col)]
print(pac_scraped_id_cols)
candidate_id_cols = [col for col in candidates.columns if regex.search(col)]
print(candidate_id_cols)
trans = pd.DataFrame.from_csv('public.raw_committee_transactions_ammended_transactions.csv')
trans_id_cols = [col for col in trans.columns if regex.search(col)]
print(trans_id_cols)
tables = [('pac', pacs, pac_id_cols), ('pac_scraped', pacs_scraped, pac_scraped_id_cols), ('candidate', candidates, candidate_id_cols), ('trans', trans, trans_id_cols)]
graph = []
for ((n1, df1, cols1), (n2, df2, cols2)) in product(tables, tables):
if n1 == n2:
continue
for col1 in cols1:
for col2 in cols2:
s1 = set(df1[col1].unique())
s2 = set(df2[col2].unique())
similarity = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
print('{}.{} -- {:.3} -- {}.{}'.format(n1, col1, similarity, n2, col2 ))
graph += [(n1, col1, similarity, n2, col2)]
graph = pd.DataFrame(sorted(graph, key=lambda x:x[2]), columns=['table1', 'column1', 'similarity', 'table2', 'column2'])
print(graph)
[] ['id'] ['id_nbr'] ['original_id', 'contributor_payee_committee_id', 'filer_id'] pac_scraped.id -- 0.00114 -- candidate.id_nbr pac_scraped.id -- 0.0 -- trans.original_id pac_scraped.id -- 0.0283 -- trans.contributor_payee_committee_id pac_scraped.id -- 0.169 -- trans.filer_id candidate.id_nbr -- 0.00114 -- pac_scraped.id candidate.id_nbr -- 0.0 -- trans.original_id candidate.id_nbr -- 0.00239 -- trans.contributor_payee_committee_id candidate.id_nbr -- 0.00908 -- trans.filer_id trans.original_id -- 0.0 -- pac_scraped.id trans.contributor_payee_committee_id -- 0.0283 -- pac_scraped.id trans.filer_id -- 0.169 -- pac_scraped.id trans.original_id -- 0.0 -- candidate.id_nbr trans.contributor_payee_committee_id -- 0.00239 -- candidate.id_nbr trans.filer_id -- 0.00908 -- candidate.id_nbr table1 column1 similarity table2 \ 0 pac_scraped id 0.000000 trans 1 candidate id_nbr 0.000000 trans 2 trans original_id 0.000000 pac_scraped 3 trans original_id 0.000000 candidate 4 pac_scraped id 0.001139 candidate 5 candidate id_nbr 0.001139 pac_scraped 6 candidate id_nbr 0.002392 trans 7 trans contributor_payee_committee_id 0.002392 candidate 8 candidate id_nbr 0.009079 trans 9 trans filer_id 0.009079 candidate 10 pac_scraped id 0.028276 trans 11 trans contributor_payee_committee_id 0.028276 pac_scraped 12 pac_scraped id 0.169406 trans 13 trans filer_id 0.169406 pac_scraped column2 0 original_id 1 original_id 2 id 3 id_nbr 4 id_nbr 5 id 6 contributor_payee_committee_id 7 id_nbr 8 filer_id 9 id_nbr 10 contributor_payee_committee_id 11 id 12 filer_id 13 id
print(pacs_scraped.index.dtype)
print(pacs.index.dtype)
object int64
trans = pd.DataFrame.from_csv('public.raw_committee_transactions_ammended_transactions.csv')
trans.describe()
original_id | amount | aggregate_amount | contributor_payee_committee_id | filer_id | intrst_rate | check_nbr | addr_book_agent_name | zip | zip_plus_four | exp_date | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 2997.000000 | 2997.000000 | 2997.000000 | 160.000000 | 2997.000000 | 9 | 5.080000e+02 | 0 | 2355.000000 | 150.000000 | 0 |
mean | 1885216.702369 | 1530.411655 | 4623.306210 | 7128.418750 | 9703.807808 | 0 | 5.985057e+06 | NaN | 90213.352017 | 5940.713333 | NaN |
std | 211274.747411 | 10621.464988 | 30598.010565 | 6719.423466 | 6866.094987 | 0 | 1.331107e+08 | NaN | 21051.288729 | 2989.624421 | NaN |
min | 71433.000000 | -597.630000 | 0.000000 | 3.000000 | 10.000000 | 0 | 0.000000e+00 | NaN | 2117.000000 | 1.000000 | NaN |
25% | 1864267.000000 | 35.000000 | 118.000000 | 629.000000 | 2480.000000 | 0 | 1.012750e+03 | NaN | 97038.000000 | 3498.000000 | NaN |
50% | 1929490.000000 | 150.000000 | 250.000000 | 4749.000000 | 11508.000000 | 0 | 1.167500e+03 | NaN | 97219.000000 | 6628.000000 | NaN |
75% | 1979015.000000 | 500.000000 | 977.650000 | 14931.000000 | 17001.000000 | 0 | 1.979250e+03 | NaN | 97401.000000 | 8652.000000 | NaN |
max | 2049455.000000 | 452550.000000 | 543057.000000 | 17370.000000 | 17454.000000 | 0 | 3.000184e+09 | NaN | 99362.000000 | 9998.000000 | NaN |
So it looks like there are multiple revisions for many of the "unique" original_id. So to consolidate those revisions into unqiue records with unique IDs (take the most recent revision as the official record):
filtered_trans = []
for id in trans.original_id.unique():
rows = sorted(trans[trans.original_id == id].iterrows(), key=lambda x:x[1].attest_date, reverse=True)
filtered_trans += [rows[0][1]]
filtered_trans = pd.DataFrame(filtered_trans)
print(len(trans) / float(len(filtered_trans)))
print(filtered_trans.describe())
1.11 original_id amount aggregate_amount \ count 2700.000000 2700.000000 2700.000000 mean 1884926.093333 1398.428496 4339.031659 std 222078.767780 10643.772144 31247.914447 min 71433.000000 -597.630000 0.000000 25% 1864223.750000 35.000000 108.290000 50% 1942541.000000 125.000000 238.805000 75% 1979260.500000 500.000000 767.900000 max 2049455.000000 452550.000000 543057.000000 contributor_payee_committee_id filer_id intrst_rate \ count 129.000000 2700.000000 9 mean 7872.085271 9612.684444 0 std 6991.062308 6897.686139 0 min 3.000000 10.000000 0 25% 470.000000 2189.000000 0 50% 4831.000000 11487.000000 0 75% 15323.000000 17007.000000 0 max 17370.000000 17454.000000 0 check_nbr addr_book_agent_name zip zip_plus_four \ count 4.210000e+02 0 2087.000000 144.000000 mean 7.221467e+06 NaN 89631.470532 6013.465278 std 1.462182e+08 NaN 21828.475998 3018.173828 min 0.000000e+00 NaN 2117.000000 1.000000 25% 1.014000e+03 NaN 97034.000000 3436.750000 50% 1.177000e+03 NaN 97218.000000 6707.000000 75% 2.060000e+03 NaN 97404.000000 8652.000000 max 3.000184e+09 NaN 99362.000000 9998.000000 exp_date count 0 mean NaN std NaN min NaN 25% NaN 50% NaN 75% NaN max NaN
df = filtered_trans
filer_sums = df.groupby('filer_id').amount.sum()
print(pacs_scraped.columns)
print(df.columns)
for (filer_id, amount) in sorted(filer_sums.iteritems(), key=lambda x:x[1], reverse=True):
names = pacs_scraped[pacs_scraped.id == filer_id].index.values
print('{}\t{}\t{}'.format(filer_id, names[0][:40] if len(names) else '', amount))
Index([u'id', u'acronym', u'pac_type', u'filing_effective_from', u'filing_type', u'address', u'campaign_phone', u'treasurer_name', u'treasurer_mailing_address', u'treasurer_work_phone_home_phone_fax', u'treasurer_email_address', u'candidate_name', u'candidate_election_office', u'candidate_party_affiliation', u'candidate_candidate_address', u'candidate_work_phone_home_phone_fax', u'candidate_mailing_address', u'candidate_email_address', u'candidate_occupation', u'candidate_employer', u'measure_election', u'measure_support', u'measure_details', u'committee_type'], dtype='object') Index([u'original_id', u'tran_date', u'tran_status', u'filer', u'contributor_payee', u'sub_type', u'amount', u'aggregate_amount', u'contributor_payee_committee_id', u'filer_id', u'attest_by_name', u'attest_date', u'review_by_name', u'review_date', u'due_date', u'occptn_ltr_date', u'pymt_sched_txt', u'purp_desc', u'intrst_rate', u'check_nbr', u'tran_stsfd_ind', u'filed_by_name', u'filed_date', u'addr_book_agent_name', u'book_type', u'title_txt', u'occptn_txt', u'emp_name', u'emp_city', u'emp_state', u'employ_ind', u'self_employ_ind', u'addr_line1', u'addr_line2', u'city', u'state', u'zip', u'zip_plus_four', u'county', u'purpose_codes', u'exp_date'], dtype='object') 416 516102.89 17015 NO on 92 Coalition 467196.58 1524 320588.88 17007 Vote Yes on Measure 92: We have the righ 294510.58 33 161334.53 17394 MONICA PAC 149530.55 4155 Citizens to Elect Dennis Richardson 100250.0 1471 71915.1 17091 Oregon Liberty Alliance 69466.19 17192 Achieving the American Dream Coalition 66041.23 16892 Every Oregon Voter Counts 65000.0 11508 55756.03 17155 Open Primaries 50000.0 142 Oregon Education Association - People fo 37448.75 16428 Jobs Political Action Committee 36000.0 13920 Kitzhaber for Governor 35947.3 15477 Close Friends PAC 34528.56 5690 32438.47 16617 Matt Geiger for Oregon 29129.22 6845 26257.0 16603 Friends of Jodi Bailey 25315.48 13807 25150.0 14232 Friends of Chuck Thomsen 24858.41 1579 24382.0 16959 Kyle Markley for Oregon 23980.3 14044 Alan Olsen for Oregon Senate Committee 19118.29 17154 Community United For Safety 18351.11 17010 Friends of Patriot Hall 16278.3 5528 15331.08 16982 Women's Constitutional Equality PAC 14840.28 3889 14817.7 4680 Sara Gelser for State Senate 14392.22 125 13470.0 13965 13164.93 15776 13041.9 51 Credit Union Legislative Action Fund 13000.0 5785 Central Oregon Association of Realtors P 12883.0 682 12767.24 17286 Coastal Lakes Environmental Action Netwo 12755.58 16368 Campaign for the Restoration and Regulat 12525.0 13635 Committee to Elect Mike McLane 12500.0 2793 12418.41 17044 Yes on 91 12217.14 17245 Capitol Watch Political Action Committee 12000.0 14201 Friends of Bill Hansell 11147.65 16887 Andrew Petersen For Oregon 10850.0 315 Jackson County Republican Central Commit 10573.22 16171 New Approach Oregon 10568.39 4797 10500.0 4893 10450.0 2189 9271.5 17097 Boddie For Bend 9043.44 16508 Friends of Paul Evans 8988.25 15255 Friends of Elizabeth Steiner Hayward 8968.87 15334 8840.36 6724 8700.0 17435 Oregon Citizens to Recall Senator Floyd 8387.68 4241 Elect Betty Komp 8276.95 17159 Protect Your Investment PAC 8267.5 4726 8131.6 15728 Committee to Elect Toni Webb - Josephine 8055.12 17162 Bend Good Government Committee 8048.0 15089 Charlie Hales for Mayor 8041.62 314 Jackson County Democratic Central Commit 7757.0 16809 Bridget Bailey for Wasco County Commissi 7438.95 329 7414.14 17295 Community United for Safety 7364.64 16889 YES on Oregon Safe Roads 7049.09 5446 6783.36 17001 Vote Yes on 90 6756.07 15145 6657.38 5328 6600.0 10 6500.0 11247 6377.54 306 6290.81 39 Oregon Trial Lawyers Association PAC 6162.5 17180 Friends and Neighbors for a Safe and Pro 6105.9 14509 6098.62 16541 Hayden for Oregon 6000.0 17277 Rask for School Board 5885.58 17343 Friends of Lars Hedbor 5830.9 15200 Friends of Jamie Damon 5697.66 17202 Save Columbia County Jobs 5564.26 16800 Oregon Anti-Crime Alliance PAC 5520.82 2352 5500.0 2480 5500.0 16426 Sal Peralta for Commissioner 5495.01 223 5267.9 17045 Fix Our Parks, a project of the Conserva 5232.8 4813 5170.0 17 5000.0 175 Oregon Nurseries Political Action Commit 5000.0 5667 Rogue Valley Association of Realtors Pol 5000.0 15683 5000.0 16988 Committee to Elect Leif for Douglas Coun 5000.0 338 Polk County Republican Central Committee 4751.98 16520 Friends of Tim Casey for Sheriff 4714.31 16480 Oregon Laborers Political Action Committ 4559.62 3428 4500.0 14961 4500.0 16537 Act Now for a Healthy Oregon Political C 4500.0 266 4427.63 17239 Millersburg Residents for a Responsible 4395.31 16559 Friends of Mark Richman 4210.75 17199 Committee to Restore, Revitalize and Reo 4179.4 15497 Friends of Steve Newgard 4066.4 16610 Protect Oregon Driver Licenses Committee 4007.8 7846 4002.11 14704 3989.15 13997 Friends of Lew Frederick 3942.8 312 Hood River County Democratic Central Com 3861.54 5203 3835.0 14592 3828.38 319 3806.89 15858 John Davis For Oregon 3772.27 5079 3771.08 15343 3759.0 379 3755.0 5208 3739.48 5580 3621.34 5089 Kevin Cameron for Oregon 3526.5 4745 Oregonians for Immigration Reform Politi 3500.0 15406 Elect Ellen Rosenblum for Attorney Gener 3500.0 17031 LBCC for the Community 3500.0 17397 Bud Pierce for a Better Oregon 3500.0 15 Association of Engineering Employees of 3483.69 16501 Friends of Robert Bruce 3450.0 16433 3281.93 26 Cable Operators Political Action Committ 3202.5 3865 3130.0 17247 Citizens for Marion County Extension 3015.57 11 3000.0 5410 Oregon Cattle PAC 3000.0 13571 Oregon Ambulatory Surgery Center Associa 3000.0 17150 Friends of Cloverdale Firefighters 3000.0 15621 2970.44 17334 Neighbors for a New Hospital 2850.0 299 Democratic Party of Coos County 2827.04 17285 Friends of Amy Kohnstamm 2765.0 7305 2600.0 17082 Beaverton Public Safety 2549.46 12786 2544.87 16196 We're for a Constitutional Government 2538.0 14377 Committee to Elect Loretta Smith 2515.0 1670 2500.0 4327 2500.0 5486 2500.0 14515 2500.0 16813 Grow PAC 2500.0 15872 Central Oregon Patriots Political Action 2455.42 325 Lane County Republican Central Committee 2382.37 113 2347.94 17077 Friends of Ray Eppley 2321.5 6104 2302.38 347 Wasco County Republican Central Committe 2263.25 15701 2249.83 3219 2240.76 618 Deschutes County Republican Central Comm 2229.69 4259 2143.21 17034 Anita Yap for PCC Board 2125.0 17057 Joe Buck For L.O. 2100.0 4572 2041.4 3604 2030.0 2307 2000.0 191 Oregon Realtors Political Action Committ 1970.03 15658 1925.29 303 1920.0 171 1898.49 16258 Friends of Diana Dickey 1890.0 101 1875.81 17067 Citizens for Improved and Safer Streets 1866.11 15519 1860.16 15348 Gomberg for State Rep 1850.0 17242 Friends of Andrew Davidson 1850.0 1686 1750.0 290 Baker County Republican Central Committe 1728.74 14049 1727.96 14984 1700.04 17158 Melton4pudboard 1690.0 4586 Oregon Federation of College Republicans 1689.5 17269 Truth In Sentencing 1680.0 11487 1668.85 16802 Oregonians for Safe Farms and Families 1656.43 16535 Elect Kerry Atherton 1648.91 17372 Friends of Trisha Claxton 1632.5 14407 1626.0 16927 Friends of James 2016 1623.2 13967 1622.2 6945 1600.0 17076 Jody Carson for City Council 1594.61 17128 Donna Lane Committee for City Council 1566.05 12493 1540.0 80 1520.0 7606 1500.0 15808 Coalition for a Healthy Oregon PAC 1500.0 15330 Friends of Jessica Vega Pederson 1494.1 17370 Youth Actvities and Academics Levy 1484.39 14608 1449.0 16812 Friends of Susan McLain 1426.8 16985 Friends of Andrew Kaza 1400.0 16434 1385.46 16538 Jodi Hack For Oregon 1383.05 17018 LaMear for Mayor 1377.23 294 1362.59 17352 Committee to Elect Lilia Caballero 1326.0 16728 Oregon GMO Right to Know 1284.0 17418 Fell for Parks 1250.42 353 1250.0 3073 Oregon Business Association PAC 1250.0 3420 1250.0 17423 Friends of Paul Carlos Southwick 1250.0 16972 Friends of Don Williams for Mayor 1232.1 16648 Scott Jackson For Sheriff 1229.26 324 Democratic Party of Lane County 1207.0 17065 OSPIRG Action 1197.65 4152 1180.0 4270 1174.25 16679 Stop the Bull Run Takeover PAC 1152.15 17119 Paul Anthony for Portland School Board 1150.0 14584 1100.0 17175 Free Our City 1100.0 14342 1064.84 335 Multnomah County Democratic Central Comm 1059.0 4792 1057.15 470 1050.0 15109 Novick for Portland 1030.0 16676 Runyan For Oregon 1021.0 16877 An Independent Voice for Oregon 1009.76 5709 1005.09 17028 Tracie Heidt for City Council 1005.0 16819 Friends of Daniel Benjamin 1000.01 3584 1000.0 4395 1000.0 5133 1000.0 10967 1000.0 15103 1000.0 16411 Stan Primozich for Yamhill County Commis 1000.0 16767 Nearman4Oregon 1000.0 348 Washington County Democratic Central Com 997.03 17121 Dennis Roler for City Council 984.07 17266 Friends of Julie Esparza Brown 976.07 169 965.98 16406 955.61 14531 950.0 17184 Price for PUD Board 920.0 17326 Friends of Christian Honl 900.0 17360 Batchelor for Board PAC 893.44 108 870.0 16936 Citizens for Neighborhood Livability 870.0 15628 Joel Ockunzzi for Jackson County Commiss 825.4 17342 Friends of Stephen Marc Beaudoin 819.7 16983 Ericksen For Oregon 813.0 14575 ChiroPAC 800.0 16757 Kathleen Taylor for Oregon 800.0 17106 Pam Ackley Campaign Account 800.0 17275 Friends of Renee for city council 795.0 4213 789.26 14183 762.16 1869 750.0 5885 750.0 16633 Committee to Elect Cathleen Callahan 750.0 16880 Lottery Local Control Committee 2016 750.0 17353 Becky Tymchuk for Beaverton School Board 750.0 17425 reelectbob 750.0 350 748.95 17407 Re-elect CTC 727.0 16624 Elect Judge Jean Marie Martwick 721.55 17328 Tina M French 709.08 16893 Laura Cooper for Rep 700.3 351 Yamhill County Republican Party 700.0 14324 700.0 5405 650.0 14242 650.0 17122 Friends of Karen Leach 646.0 5192 635.98 14277 615.0 16034 Barb for Bend 601.52 16732 Taylor For Sheriff 600.0 17257 Mike Rosen for Portland School Board 600.0 17349 Robert Keyser for Port Commission 600.0 5544 586.95 14892 570.0 5434 564.25 16938 Committee to Elect Curt Chancler for Cou 562.05 5585 Pete Sorenson Committee 550.0 16902 Clark for Mayor 550.0 15192 Buehler For a United Oregon 548.48 17261 Creswell for Affordable Water 538.09 3396 515.0 336 508.84 321 Klamath County Republican Central Commit 504.99 97 Mortgage Industry for Responsible Govern 500.0 161 Oregon Hospital Political Action Committ 500.0 228 500.0 317 500.0 1430 500.0 4846 500.0 5152 500.0 12631 500.0 12848 Douglas County Physicians PAC 500.0 13619 Oregon Aging & Disability Services Polit 500.0 13648 500.0 14223 500.0 16057 Oregon State Ass'n of Plumbing Heating C 500.0 16583 Friends of Bill Post 500.0 16680 Small Business PAC 500.0 16750 Vote Tom Brown County Commissioner 500.0 16753 Friends of Nick Caleb 500.0 16777 Elect Jackson Commissioner 500.0 16904 Friends of Rick Rose 500.0 16953 Humane Oregon PAC 500.0 17101 Lisa Seales for Bend City Council 500.0 328 488.62 291 469.99 17267 Elect Eileen Nittler 464.95 17096 Elect Tom Anderson 459.0 3477 450.0 16388 Support Local Food Rights 450.0 2109 446.84 3674 444.25 16907 Friends of Dan Chriestenson 411.5 12633 400.0 15237 Friends of Alissa Keny-Guyer 400.0 17090 Committee for Better Transit Now 400.0 17314 Support Our Sheriff 400.0 15298 399.8 16673 Friends of Kyle Allen 395.18 304 Deschutes County Democratic Central Comm 390.47 17151 Sustainable Hood River 390.0 17323 Lane Citizens for Safe Roads 388.87 2690 377.27 17335 Friends of Ivonne Gebhardt 357.42 16131 Communities of Color for a Just Oregon 320.0 3077 Oregon Independent Automobile Dealers As 315.0 6004 ArtPAC 313.48 10087 300.0 16616 Team BSW 295.35 14220 290.25 13966 Henry Heimuller for Columbia County Comm 287.74 14969 280.0 10367 The Good Government Council 276.13 16908 Taxpayer Defense Project PAC 275.0 283 Women's Investment Network PAC 270.58 16154 Portland Clean Air 266.54 302 Curry County Democratic Central Committe 257.01 16042 Scott Barbur for Milwaukie 256.51 16808 Lawrence for Judge 252.0 2803 250.0 2829 Oregon Refuse and Recycling Association 250.0 3917 250.0 12688 Committee to Elect Seth Crawford 250.0 14391 250.0 15704 250.0 16615 Schuster for Commissioner 250.0 16738 Ken Helm for House District 34 250.0 16910 Fair and Unified Elections Committee 238.93 16381 Building Tualatin's Future 231.14 14765 Friends of John Goodhouse 223.98 13436 221.25 17347 Public Employee Choice Act Committee 218.96 293 Clackamas County Democratic Central Comm 218.76 180 Oregon Physical Therapy PAC 201.0 216 200.0 320 Klamath County Democratic Central Commit 200.0 5458 200.0 5926 200.0 12943 200.0 15741 Oregon End Violence Against Women Politi 200.0 16548 Elect Craig Wilhelm 200.0 17125 Yes for Parks 3-451 200.0 17339 Benton County Citizens Opposed to Measur 200.0 17454 Citizens for a Healthy County 200.0 2278 Committee to Elect Jeff Kruse 199.0 17022 Vote Colt 196.88 16384 Citizens for Moving Damascus Forward 196.0 16761 Friends of Stephanie Nystrom 187.5 16814 Friends of Carla Piluso 180.78 15755 Campaign for Mark Webb 180.0 17174 Committee to Elect Wendy Engler 177.79 130 Oregon Certified Public Accountants Legi 170.61 16922 ElectRayK 170.0 16373 Help End Marijuana Prohibition in Oregon 165.0 16471 Ladd Wiles for Judge 150.0 16945 Friends of Gerritt 150.0 17073 YES For Neighbors and Livability 140.36 13561 140.0 147 135.05 16709 Citizens to Elect Carl Wilson 131.37 15659 Future of Racing in Oregon Group PAC 131.15 16999 Elect Jackie Pride for Tualatin 127.0 14252 Citizens for Dave Dotterrer 125.74 7665 125.0 17166 Freedom From Pesticides Alliance 125.0 5583 121.13 931 120.33 15469 Friends of Herman Baertschiger 113.18 221 110.92 14993 107.08 5783 105.0 12916 104.0 16780 Committee to Elect Darlene V. Taylor 102.0 13774 Family Physicians of Oregon PAC 100.0 13837 100.0 15684 100.0 16989 Committee to Recall Hales 100.0 17107 Committee to elect Michael Zarosinski 100.0 17415 Friends of Colleen Busch 100.0 16528 Jodie For Deschutes 97.75 17363 Committee to Elect Janet Lightheart 97.0 3215 96.45 14922 85.94 4741 Friends of Chip Shields 80.0 16454 75.52 16969 Sonnleitner for Zone 3 PCC 75.24 16543 Friends of John Turner 68.88 16459 Friends of Rob Nosse 60.0 14490 Oregon Technology PAC 55.98 318 Josephine County Democratic Central Comm 53.74 342 Umatilla County Republican Central Commi 50.0 14523 The Mother PAC 50.0 16825 Barrett for Clatsop County Commission #5 50.0 17308 Elect Kent Norris 33.7 15523 Committee to Elect Jan Giunta 30.37 16168 30.0 292 25.0 4100 Bar Pilots of the Columbia River Politic 25.0 17264 Benton Food Freedom 20.99 17387 Supporters of Chelsea Martin 20.0 4824 18.99 145 18.26 247 SalPAC 17.5 3698 10.0 16407 8.0 622 Libertarian Party of Oregon 0.43 6106 0.33
Let's build a graph of the similarity between PACs based on the wording of their committee names
import matplotlib
%matplotlib inline
np = pd.np
np.norm = np.linalg.norm
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
df = pacs_scraped
names = df.index.values
corpus = [' '.join(str(f) for f in fields) for fields in zip(*[df[col] for col in df.columns if df[col].dtype == pd.np.dtype('O')])]
print(corpus[:3])
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), stop_words='english')
tfidf = vectorizer.fit_transform(corpus)
cov = tfidf * tfidf.T
cov[0:]
['GCC Miscellaneous 01/03/2014 to present Amendment 435 W First Ave P.O. Box 548 Albany, OR 97321 (541)926-1517 Janet Steele 435 1st Avenue W. Albany, OR 97321 (541)926-1517 (541)979-2395 (541)926-7064 jsteele@albanychamber.com nan nan nan nan nan nan nan nan nan nan nan nan PAC', 'nan nan 11/29/2012 to 11/29/2012 Discontinuation 28356 SW Wagner Street Wilsonville, OR 97070 (503)685-7346 Carol A. Russell 89358 Cranberry Lane Bandon, OR 97411 (800)893-1005 (541)347-4423 (800)967-7380 russell.cna@gmail.com Matt Wingard 2012 Primary Election State Representative, 26th District Republican 28356 SW Wagner St. Wilsonville, OR 97070 (503)351-2955 28356 SW Wagner St. Wilsonville, OR 97070 jobs_and_power@yahoo.com Public Relations Beambo, Inc., Wilsonville, OR nan nan nan CC', 'nan nan 04/30/2013 to present Amendment 3321 SE 20th Ave Portland, OR 97202 nan Kevin F. Neely PO Box 42307 Portland, OR 97242 (503)295-1851 kevin.neely@c-esystems.com John Kroger 2012 Primary Election Attorney General Democrat 9530 SW 12th Dr. Portland, OR 97219 nan PO Box 42307 Portland, OR 97242 justinstjames@gmail.com Attorney General State of Oregon Department of Justice, Salem, OR nan nan nan CC']
<1415x1415 sparse matrix of type '<type 'numpy.float64'>' with 2002225 stored elements in Compressed Sparse Row format>