#!/usr/bin/env python # coding: utf-8 # # Late contributions Received and Made # ## Setup # In[2]: get_ipython().run_line_magic('load_ext', 'sql') # In[3]: from django.conf import settings connection_string = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format( **settings.DATABASES['default'] ) get_ipython().run_line_magic('sql', '$connection_string') # ## Unique Composite Key # The documentation says that the records are unique on the following fields: # * `FILING_ID` # * `AMEND_ID` # * `LINE_ITEM` # * `REC_TYPE` # * `FORM_TYPE` # # `REC_TYPE` is always the same value: `S497`, so we can ignore this column. # # `FORM_TYPE` is either `F497P1` or `F497P2`, indicating in whether itemized transaction is listed under Part 1 (Contributions Received) or Part 2 (Contributions Made). I'll split these up into separate tables. # ## Are the `S497_CD` records actually unique on `FILING_ID`, `AMEND_ID` and `LINE_ITEM`? # Yes. And this is even true across the Parts 1 and 2 (Contributions Received and Contributions Made). # In[11]: get_ipython().run_cell_magic('sql', '', 'SELECT "FILING_ID", "AMEND_ID", "LINE_ITEM", COUNT(*)\nFROM "S497_CD"\nGROUP BY 1, 2, 3\nHAVING COUNT(*) > 1\nORDER BY COUNT(*) DESC;\n') # ## `TRAN_ID` # The `S497_CD` table includes a `TRAN_ID` field, which the [documentation](http://calaccess.californiacivicdata.org/documentation/calaccess-files/s497-cd/#fields) describes as a "Permanent value unique to this item". # ### Is `TRAN_ID` ever `NULL` or blank? # No. # In[10]: get_ipython().run_cell_magic('sql', '', 'SELECT COUNT(*)\nFROM "S497_CD"\nWHERE "TRAN_ID" IS NULL OR "TRAN_ID" = \'\' OR "TRAN_ID" = \'0\';\n') # ### Is `TRAN_ID` unique across filings? # Decidedly no. # In[3]: get_ipython().run_cell_magic('sql', '', 'SELECT "TRAN_ID", COUNT(DISTINCT "FILING_ID") \nFROM "S497_CD"\nGROUP BY 1\nHAVING COUNT(DISTINCT "FILING_ID") > 1\nORDER BY COUNT(DISTINCT "FILING_ID") DESC\nLIMIT 100;\n') # But `TRAN_ID` does appear to be unique within each filing amendment, and appears to be reused for each filing. # In[7]: get_ipython().run_cell_magic('sql', '', 'SELECT "FILING_ID", "TRAN_ID", COUNT(DISTINCT "AMEND_ID") AS amend_count, COUNT(*) AS row_count\nFROM "S497_CD"\nGROUP BY 1, 2\nORDER BY COUNT(*) DESC\nLIMIT 100;\n') # There's one exception: # In[8]: get_ipython().run_cell_magic('sql', '', 'SELECT "FILING_ID", "TRAN_ID", "AMEND_ID", COUNT(*)\nFROM "S497_CD"\nGROUP BY 1, 2, 3\nHAVING COUNT(*) > 1;\n') # Looks like this `TRAN_ID` is duplicated across the two parts of the filing. So it was both a contribution both made and received? # In[5]: get_ipython().run_cell_magic('sql', '', 'SELECT *\nFROM "S497_CD"\nWHERE "FILING_ID" = 2072379\nAND "TRAN_ID" = \'EXP9671\';\n') # Looking at the [PDF for the filing](http://cal-access.ss.ca.gov/PDFGen/pdfgen.prg?filingid=2072379&amendid=1), it appears to be a check from the California Psychological Association PAC to the McCarty for Assembly 2016 committee, which was given and returned on 8/25/2016. # # Regardless, because the combinations of `FILING_ID`, `AMEND_ID` and `TRAN_ID` are unique within each part of the Schedule 497, we could substitute `TRAN_ID` for `LINE_ITEM` in the composite key when splitting up the contributions received from the contributions made. # # The advantage is that the `TRAN_ID` purportedly points to the same contribution from one amendment to the next, whereas the same `LINE_ITEM` might not because the filers don't necessarily list transactions on the same line from one filing amendment to the next. # # Here's an example: On the [original Schedule 497 filing](http://cal-access.ss.ca.gov/PDFGen/pdfgen.prg?filingid=2083478&amendid=0) for Steven Bradford for Senate 2016, a $8,500.00 contribution from an AFL-CIO sub-committee is listed on line 1. But on the [first](http://cal-access.ss.ca.gov/PDFGen/pdfgen.prg?filingid=2083478&amendid=1) and [second](http://cal-access.ss.ca.gov/PDFGen/pdfgen.prg?filingid=2083478&amendid=2) amendments to the filing, it is listed on line 4. #