#!/usr/bin/env python # coding: utf-8 # ### Load Packages # In[440]: 3522 + 35 + 1032 + 62 + 206 # In[1]: import sys import time import json # In[2]: import numpy as np import pandas as pd import matplotlib.pyplot as plt import sqlite3 as lite from pandas import DataFrame from pandas import Series # In[3]: #http://pandas.pydata.org/pandas-docs/stable/options.html pd.set_option('display.max_columns', None) pd.set_option('max_colwidth', 250) # In[92]: from __future__ import division # #### Set working directory # In[4]: cd '/Users/gregorysaxton/Google Drive/SOX' # ### Create List of EINs for '2016' charities -- I'll use this to create a list of EINs for matching with e-file database # - Previous (Read in Saved List of EINs with 'current' (2016) donor advisories) # - What I'm doing here is reading in the Stata 'Table 5' dataset so that I can create a list of the 8,304 relevant EINs. # In[7]: #f = open('2016 donor advisory EINs.json', 'r') #advisories_2016 = json.load(f) #print len(advisories_2016) #print advisories_2016[:10] # In[6]: #df = pd.read_pickle('2016 - Test 4 data.pkl') #print "Number of columns:", len(df.columns) #print "Number of observations:", len(df) #df.head(1) # In[5]: #print len(set(df['EIN'].tolist())) #eins_2016 = df['EIN'].tolist() #print len(eins_2016), len(set(eins_2016)) #print eins_2016[:5] # #### Read in Test 4 data # In[393]: df = pd.read_pickle('2016 - Test 4 data.pkl') print "Number of columns:", len(df.columns) print "Number of observations:", len(df) df.head(1) # In[23]: print len(set(df['EIN'].tolist())) eins_2016 = df['EIN'].tolist() print len(eins_2016), len(set(eins_2016)) print eins_2016[:5] # In[ ]: print len(eins_2016) with open('eins_2016.json', 'w') as outfile: json.dump(eins_2016, outfile) # In[ ]: #import json #with open('valid_next_v8.json', 'r') as fp: # valid_next = json.load(fp) #print len(valid_next) #print valid_next[:5] # # Read DB into PANDAS DF # - I'm taking all 29,440 990 e-filings for our 8,304 EINs and importing them from MongoDB into a Python PANDAS dataframe for manipulation. # - As you can see if you scroll horizontally through this sample row, there are 691 columns! # In[8]: df_all = pd.read_pickle('e-file data for 7,133 of 8,304 2016 EINs.pkl') print '# of columns:', len(df_all.columns) print '# of observations:', len(df_all) df_all.head(1) # ## Identify Governance Variables # In[17]: vars = pd.read_pickle('variable descriptions 2010-2015 990 e-file data (first year only).pkl') print '# of columns:', len(vars.columns) print '# of observations:', len(vars) vars.head(1) # ##### Independent and voting directors # In[37]: cols_2013 = ['GoverningBodyVotingMembersCnt', 'IndependentVotingMemberCnt', 'VotingMembersGoverningBodyCnt', 'VotingMembersIndependentCnt'] cols_2010 = ['NbrIndependentVotingMembers', 'NbrVotingGoverningBodyMembers', 'NbrVotingMembersGoverningBody', 'NumberIndependentVotingMembers'] df_all[['FYE', 'EIN'] + cols_2010+cols_2013][:20] # In[43]: print len(df_all[(df_all['IndependentVotingMemberCnt'].notnull())]) print len(df_all[(df_all['VotingMembersIndependentCnt'].notnull())]) print len(df_all[(df_all['IndependentVotingMemberCnt'].isnull()) & (df_all['VotingMembersIndependentCnt'].notnull())]) print len(df_all[(df_all['VotingMembersIndependentCnt'].isnull()) & (df_all['IndependentVotingMemberCnt'].notnull())]), '\n' print len(df_all[(df_all['GoverningBodyVotingMembersCnt'].isnull()) & (df_all['VotingMembersGoverningBodyCnt'].notnull())]) print len(df_all[(df_all['VotingMembersGoverningBodyCnt'].isnull()) & (df_all['GoverningBodyVotingMembersCnt'].notnull())]) #
Based on the above we can drop the first two for each year -- we'll go with the Section I, Line 3 and 4 versions, which seem to be the same # In[242]: cols = ['EIN', 'FYE'] #cols_2013 = ['VotingMembersGoverningBodyCnt', 'VotingMembersIndependentCnt'] #cols_2010 = ['NbrIndependentVotingMembers', 'NbrVotingMembersGoverningBody'] cols = cols + ['VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers'] print cols # ##### Outsourced Management # In[243]: cols = cols + ['DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties'] print cols # ##### IRS 990 Review # In[244]: cols = cols + ['Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody'] print cols # ##### Audit Committee # In[245]: cols = cols + ['AuditCommitteeInd', 'AuditCommittee'] print cols # ##### Donor Restrictions # In[54]: 7351259+4433997+11904841 #
What we see here is that *TemporarilyRstrNetAssetsGrp* + *PermanentlyRstrNetAssetsGrp* + *UnrestrictedNetAssetsGrp* = *TotalNetAssetsFundBalanceGrp* = *NetAssetsOrFundBalancesEOYAmt* # # - Given that *TotalNetAssetsFundBalanceGrp* is a 'group' variable, I should gather EOY values from the first two variables, take the sum, and then divide by *NetAssetsOrFundBalancesEOYAmt* in order to get a *DONOR RESTRICTIONS* VARIABLE SIMILAR TO YETMAN & YETMAN # In[53]: asset_cols = ['TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', 'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets', 'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances', 'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY'] df_all[['EIN', 'FYE']+asset_cols][:10] # In[62]: #cols = cols + ['TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', #'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets', #'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances', #'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY'] #print cols # In[247]: print df_all.columns.tolist()[-20:] # In[249]: cols = cols + ['perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions'] # ##### Tax-Exempt Bonds (municipal bonds) # In[250]: cols = cols + ['TaxExemptBondsInd', 'TaxExemptBonds'] print cols # ##### Taxable Revenues # In[251]: cols = cols + ['NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome'] print cols # ##### Audited Financials and Federal Audits # In[69]: #df_all['AuditedFinancialStmtAttInd'].value_counts() # In[76]: audit_cols = [#'AuditedFinancialStmtAttInd', #'ConsolidatedAuditFinclStmtInd', 'ConsolidatedAuditFinancialStmt', 'FSAuditedInd', 'FSAudited', #'FSAuditedBasisGrp', 'FSAuditedBasis', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', ] df_all[['EIN', 'FYE']+ audit_cols][:8] # In[252]: cols = cols + ['FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', ] print cols # #### Create *Donor Restrictions* variable as in Yetman & Yetman (2012) # I NEED TO DO IT *BEFORE* COLLAPSING -- OTHERWISE IT WILL CAUSE PROBLEMS FOR AN EIN THAT HAS NO PERMANENTLY RESTRICTED ASSETS IN ONE YEAR, SAY, BUT IT DOES THE NEXT -- THE GROUPBY FUNCTION WILL BRING THOSE INTO THE SAME ROW. # ##### Donor Restrictions -- Part (a) - Permanently Restricted Assets # In[199]: from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING) import timeit start_time = timeit.default_timer() for index, row in df_all[:].iterrows(): perm_rest = 0 if pd.notnull(row['PermanentlyRstrNetAssetsGrp']): #print 'not null!' if 'EOYAmt' in row['PermanentlyRstrNetAssetsGrp']: perm_rest = row['PermanentlyRstrNetAssetsGrp']['EOYAmt'] elif 'BOYAmt' in row['PermanentlyRstrNetAssetsGrp']: perm_rest = row['PermanentlyRstrNetAssetsGrp']['BOYAmt'] elif pd.notnull(row['PermanentlyRestrictedNetAssets']): print 'going to second variable' if 'EOY' in row['PermanentlyRestrictedNetAssets']: perm_rest = row['PermanentlyRestrictedNetAssets']['EOY'] elif 'BOY' in row['PermanentlyRestrictedNetAssets']: perm_rest = row['PermanentlyRestrictedNetAssets']['BOY'] df_all.ix[index, 'perm_rest_assets'] = perm_rest clear_output() print ('Index: ', index, 'Perm. Restr Assets:', perm_rest), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' print df_all['perm_rest_assets'].value_counts().sum() # In[200]: print len(df_all[df_all['perm_rest_assets'].notnull()]) print len(df_all[df_all['perm_rest_assets'].isnull()]) print df_all['perm_rest_assets'].describe(), '\n' df_all['perm_rest_assets'] = df_all['perm_rest_assets'].astype('int') print df_all['perm_rest_assets'].describe() # ##### Donor Restrictions -- Part (b) - Temporarily Restricted Assets # In[201]: from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING) import timeit start_time = timeit.default_timer() for index, row in df_all[:].iterrows(): temp_rest = 0 if pd.notnull(row['TemporarilyRstrNetAssetsGrp']): #print 'not null!' if 'EOYAmt' in row['TemporarilyRstrNetAssetsGrp']: temp_rest = row['TemporarilyRstrNetAssetsGrp']['EOYAmt'] elif 'BOYAmt' in row['TemporarilyRstrNetAssetsGrp']: temp_rest = row['TemporarilyRstrNetAssetsGrp']['BOYAmt'] elif pd.notnull(row['TemporarilyRestrictedNetAssets']): print 'going to second' if 'EOY' in row['TemporarilyRestrictedNetAssets']: temp_rest = row['TemporarilyRestrictedNetAssets']['EOY'] elif 'BOY' in row['TemporarilyRestrictedNetAssets']: temp_rest = row['TemporarilyRestrictedNetAssets']['BOY'] df_all.ix[index, 'temp_rest_assets'] = temp_rest clear_output() print ('Index: ', index, 'Temp. Restr Assets:', temp_rest), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' print df_all['temp_rest_assets'].value_counts().sum() # In[202]: print len(df_all[df_all['temp_rest_assets'].notnull()]) print len(df_all[df_all['temp_rest_assets'].isnull()]) print df_all['temp_rest_assets'].describe(), '\n' df_all['temp_rest_assets'] = df_all['temp_rest_assets'].astype('int') print df_all['temp_rest_assets'].describe() # ##### Donor Restrictions -- Part (c) - Total Assets # In[203]: df_all['net_assets'] = df_all['NetAssetsOrFundBalancesEOYAmt'] df_all['net_assets'] = np.where(df_all['net_assets'].isnull(), df_all['NetAssetsOrFundBalancesEOY'], df_all['net_assets']) print df_all['net_assets'].describe(), '\n' df_all['net_assets'] = df_all['net_assets'].astype('int') print df_all['net_assets'].describe() # ##### Donor Restrictions -- Part (d) - Calculations # In[204]: df_all['donor_restrictions'] = (df_all['temp_rest_assets']+df_all['perm_rest_assets'])/df_all['net_assets'] df_all['donor_restrictions'].describe() # In[205]: df_all.dtypes[-5:] #
THERE ARE 40 MISSING CASES -- ASSIGN THEM A VALUE OF ZERO # In[207]: print len(df_all[df_all['donor_restrictions'].isnull()]) # In[208]: df_all['donor_restrictions'] = np.where(df_all['donor_restrictions'].isnull(), 0, df_all['donor_restrictions']) df_all['donor_restrictions'].describe() # In[209]: print len(df_all[df_all['donor_restrictions'].isnull()]) #
CHECK A FEW >1 ROWS # In[221]: df_all[['FYE', 'donor_restrictions', 'temp_rest_assets', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets', 'perm_rest_assets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', 'net_assets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY', #'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets', 'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances', #'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']][-6:] # In[211]: df_all[['FYE', 'donor_restrictions', 'temp_rest_assets', 'perm_rest_assets', 'net_assets']][-6:] # In[224]: print len(df_all[df_all['donor_restrictions']>.99]) print len(df_all[df_all['donor_restrictions']>1]) df_all[df_all['donor_restrictions']>1][['FYE', 'donor_restrictions', 'temp_rest_assets', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets', 'perm_rest_assets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', 'net_assets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY', #'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets', 'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances', #'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']][-6:] # ##### Fees for Services (proxy for audit) # In[228]: df_all['FeesForServicesAccountingGrp_v2'] = np.where(df_all['FeesForServicesAccountingGrp'].notnull(), df_all['FeesForServicesAccountingGrp'], df_all['FeesForServicesAccounting']) print len(df_all[df_all['FeesForServicesAccountingGrp_v2'].isnull()]), '\n' print len(df_all[df_all['FeesForServicesAccountingGrp_v2'].notnull()]), '\n' # In[229]: for index, row in df_all[df_all['FeesForServicesAccountingGrp_v2'].notnull()][:].iterrows(): #print type(row['FeesForServicesAccountingGrp_v2']), row['FeesForServicesAccountingGrp_v2'] if 'Total' in row['FeesForServicesAccountingGrp_v2']: #print 'THERE IS "Total"', row['FeesForServicesAccountingGrp_v2']['Total'] df_all.ix[index, 'FeesForServicesAccounting_TotalAmt'] = row['FeesForServicesAccountingGrp_v2']['Total'] if 'TotalAmt' in row['FeesForServicesAccountingGrp_v2']: #print 'THERE IS "TotalAmt"', row['FeesForServicesAccountingGrp_v2']['TotalAmt'] df_all.ix[index, 'FeesForServicesAccounting_TotalAmt'] = row['FeesForServicesAccountingGrp_v2']['TotalAmt'] clear_output() print ('Index: ', index, 'Fees:', df_all.ix[index, 'FeesForServicesAccounting_TotalAmt']), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' df_all['FeesForServicesAccounting_TotalAmt'].value_counts().sum() # In[230]: df_all['FeesForServicesAccounting_TotalAmt'] = df_all['FeesForServicesAccounting_TotalAmt'].astype('float') # In[231]: df_all['FeesForServicesAccounting_binary'] = np.where(df_all['FeesForServicesAccounting_TotalAmt']>0, 1,0) df_all['FeesForServicesAccounting_binary'].value_counts() # In[232]: print '# of rows where fees less than 0:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']<0]), '\n' print '# of rows where fees equal zero:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']==0]), '\n' print '# of rows where fees greater than zero:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']>0]), '\n' print '# of rows missing values:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt'].isnull()]), '\n' # In[233]: print '# of rows that should have value of zero:', 4+1274+3723 # In[236]: df_all[df_all['FeesForServicesAccounting_TotalAmt']<0][:][['FeesForServicesAccounting_TotalAmt']] # In[237]: df_all['FeesForServicesAccounting_binary'].value_counts() # In[254]: cols = cols + ['FeesForServicesAccounting_binary'] print cols # In[287]: print len(df_all[df_all['donor_restrictions']>100]) df_all[df_all['donor_restrictions']>100][cols] # ##### Family Business Relations # In[443]: cols = cols + ['FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship'] print cols # ##### Tax Exempt Bonds, Take II # In[444]: cols = cols + ['TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities'] print cols # ##### Government Grants # In[445]: cols = cols + ['GovernmentGrantsAmt', 'GovernmentGrants'] print cols # ##### Election of Board Members # In[446]: cols = cols + ['ElectionOfBoardMembersInd', 'ElectionOfBoardMembers'] print cols # ##### Zero Fundraising # In[551]: cols = cols + ['CYTotalFundraisingExpenseAmt', 'TotalFundrsngExpCurrentYear'] print cols # ##### Save DF # In[255]: len(df_all) # In[256]: df_all.to_pickle('e-file data for 7,133 of 8,304 2016 EINs (n=24940).pkl') # ### Collapse Dataset -- take last # In[447]: df_all = df_all.sort_values(by=['EIN', 'FYE'], ascending=[1,0]) df_all[:5][cols] # In[259]: df_all[-3:][cols] # In[552]: print len(df_all[cols].groupby('EIN').agg('first')) governance = df_all[cols].groupby('EIN').agg('first') print len(governance) governance = governance.reset_index() governance[:5] # In[449]: governance['FYE'].value_counts() # ### Create combined variables # In[262]: print cols # ##### Independent Directors # In[450]: governance['independent_directors_num'] = governance['VotingMembersIndependentCnt'] governance['independent_directors_num'] = np.where(governance['independent_directors_num'].isnull(), governance['NbrIndependentVotingMembers'], governance['independent_directors_num']) print len(governance[governance['independent_directors_num'].isnull()]) #print governance['independent_directors_num'].value_counts() governance['independent_directors_num'] = governance['independent_directors_num'].astype('int') governance['independent_directors_num'].describe() # In[451]: governance[governance['VotingMembersIndependentCnt'].isnull()][:2] # ##### Voting Directors # In[452]: governance['voting_directors'] = governance['VotingMembersGoverningBodyCnt'] governance['voting_directors'] = np.where(governance['voting_directors'].isnull(), governance['NbrVotingMembersGoverningBody'], governance['voting_directors']) print len(governance[governance['voting_directors'].isnull()]) #print governance['voting_directors'].value_counts() governance['voting_directors'] = governance['voting_directors'].astype('int') # ##### Percent Independent Directors # In[457]: governance['independent_directors_pct'] = governance['independent_directors_num']/governance['voting_directors'] governance['independent_directors_pct'].describe() # In[456]: governance[governance['independent_directors_pct'].isnull()] # In[458]: governance['independent_directors_pct'] = np.where(governance['independent_directors_pct'].isnull(), 0, governance['independent_directors_pct']) governance['independent_directors_pct'].describe() #
Second version -- none over 100% # In[461]: print len(governance[governance['independent_directors_pct']>1]) governance['independent_directors_pct_v2'] = np.where(governance['independent_directors_pct']>1, 1, governance['independent_directors_pct']) print len(governance[governance['independent_directors_pct_v2']>1]) print governance['independent_directors_pct_v2'].describe() # ##### Outsourced Management # In[463]: governance['outsourced_mgt'] = governance['DelegationOfMgmtDutiesInd'] governance['outsourced_mgt'] = np.where(governance['outsourced_mgt'].isnull(), governance['DelegationOfManagementDuties'], governance['outsourced_mgt']) print len(governance[governance['outsourced_mgt'].isnull()]) print governance['outsourced_mgt'].value_counts() # In[464]: governance['outsourced_mgt'] = np.where(governance['outsourced_mgt']=='true', 1, governance['outsourced_mgt']) governance['outsourced_mgt'] = np.where(governance['outsourced_mgt']=='false', 0, governance['outsourced_mgt']) print governance['outsourced_mgt'].value_counts() # In[465]: governance['outsourced_mgt']=governance['outsourced_mgt'].astype('int') print governance['outsourced_mgt'].value_counts() # ##### IRS 990 Review # In[466]: governance['990_review'] = governance['Form990ProvidedToGvrnBodyInd'] governance['990_review'] = np.where(governance['990_review'].isnull(), governance['Form990ProvidedToGoverningBody'], governance['990_review']) print len(governance[governance['990_review'].isnull()]) print governance['990_review'].value_counts() # In[467]: governance['990_review'] = np.where(governance['990_review']=='true', 1, governance['990_review']) governance['990_review'] = np.where(governance['990_review']=='false', 0, governance['990_review']) governance['990_review'] = governance['990_review'].astype('int') print governance['990_review'].value_counts() # ##### Audit Committee # I'M ASSIGNING A VALUE OF '0' TO THE 138 CASES MISSING VALUES # In[468]: governance['audit_committee'] = governance['AuditCommitteeInd'] governance['audit_committee'] = np.where(governance['audit_committee'].isnull(), governance['AuditCommittee'], governance['audit_committee']) print len(governance[governance['audit_committee'].isnull()]) print governance['audit_committee'].value_counts() # In[469]: governance['audit_committee'] = np.where(governance['audit_committee']=='true', 1, governance['audit_committee']) governance['audit_committee'] = np.where(governance['audit_committee']=='false', 0, governance['audit_committee']) governance['audit_committee'] = np.where(governance['audit_committee'].isnull(), 0, governance['audit_committee']) governance['audit_committee']=governance['audit_committee'].astype('int') print governance['audit_committee'].value_counts() # In[470]: governance.describe().T # ### Donor Restrictions # - NOTE THAT I'M NOT DOING IT THIS WAY ANY MORE -- NOW IT IS DONE ABOVE IN *df_all* BEFORE COLLAPSING THE DATA # - ALSO NOTE THAT THE 990 DATA MUST HAVE SOME ERRORS BECAUSE THERE ARE A FEW CASES WITH VERY HIGH VALUES ON *DONOR_RESTRICTIONS* # In[471]: #print governance['donor_restrictions'].min() #governance['donor_restrictions'].max() # In[473]: print len(governance[governance['donor_restrictions']>100]) governance[governance['donor_restrictions']>100] # In[472]: print len(governance[governance['donor_restrictions']>10]) # In[290]: governance[governance['donor_restrictions']>10] # In[474]: governance['donor_restrictions'] = np.where(governance['donor_restrictions']>100, 0, governance['donor_restrictions']) governance['donor_restrictions'].describe() # ##### Donor Restrictions (binary) - as in Harris et al. # In[476]: governance[:1] # In[479]: governance['restricted_donations'] = np.where(((governance['perm_rest_assets']>1)| (governance['temp_rest_assets']>1)), 1,0) governance['restricted_donations'].value_counts() # In[491]: governance[['perm_rest_assets', 'temp_rest_assets', 'restricted_donations']][20:25] # ##### Donor Restrictions -- Part (a) - Temporarily Restricted Assets # In[123]: ''' print len(governance[governance['TemporarilyRstrNetAssetsGrp'].isnull()]) #, 'TemporarilyRestrictedNetAssets', #'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', #'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY', ''' # In[291]: #governance[267:270] # In[140]: ''' from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING) import timeit start_time = timeit.default_timer() for index, row in governance[:].iterrows(): if pd.notnull(row['TemporarilyRstrNetAssetsGrp']): #print 'not null!' if 'EOYAmt' in row['TemporarilyRstrNetAssetsGrp']: temp_rest = row['TemporarilyRstrNetAssetsGrp']['EOYAmt'] elif 'BOYAmt' in row['TemporarilyRstrNetAssetsGrp']: temp_rest = row['TemporarilyRstrNetAssetsGrp']['BOYAmt'] elif pd.notnull(row['TemporarilyRestrictedNetAssets']): print 'going to second' if 'EOY' in row['TemporarilyRestrictedNetAssets']: temp_rest = row['TemporarilyRestrictedNetAssets']['EOY'] elif 'BOY' in row['TemporarilyRestrictedNetAssets']: temp_rest = row['TemporarilyRestrictedNetAssets']['BOY'] governance.ix[index, 'temp_rest_assets'] = temp_rest clear_output() print ('Index: ', index, 'Temp. Restr Assets:', temp_rest), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' print governance['temp_rest_assets'].value_counts().sum() ''' # In[146]: ''' print len(governance[governance['temp_rest_assets'].notnull()]) print len(governance[governance['temp_rest_assets'].isnull()]) print governance['temp_rest_assets'].describe() governance['temp_rest_assets'] = governance['temp_rest_assets'].astype('int') print governance['temp_rest_assets'].describe() ''' # ##### Donor Restrictions -- Part (b) - Permanently Restricted Assets # In[292]: #governance[:2] # In[149]: ''' print len(governance[governance['PermanentlyRestrictedNetAssets'].notnull()]) print len(governance[governance['PermanentlyRestrictedNetAssets'].isnull()]) print len(governance[governance['PermanentlyRstrNetAssetsGrp'].notnull()]) print len(governance[governance['PermanentlyRstrNetAssetsGrp'].isnull()]) ''' # In[151]: ''' from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING) import timeit start_time = timeit.default_timer() for index, row in governance[:].iterrows(): if pd.notnull(row['PermanentlyRstrNetAssetsGrp']): #print 'not null!' if 'EOYAmt' in row['PermanentlyRstrNetAssetsGrp']: perm_rest = row['PermanentlyRstrNetAssetsGrp']['EOYAmt'] elif 'BOYAmt' in row['PermanentlyRstrNetAssetsGrp']: perm_rest = row['PermanentlyRstrNetAssetsGrp']['BOYAmt'] elif pd.notnull(row['PermanentlyRestrictedNetAssets']): print 'going to second variable' if 'EOY' in row['PermanentlyRestrictedNetAssets']: perm_rest = row['PermanentlyRestrictedNetAssets']['EOY'] elif 'BOY' in row['PermanentlyRestrictedNetAssets']: perm_rest = row['PermanentlyRestrictedNetAssets']['BOY'] governance.ix[index, 'perm_rest_assets'] = perm_rest clear_output() print ('Index: ', index, 'Perm. Restr Assets:', perm_rest), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' print governance['perm_rest_assets'].value_counts().sum() ''' # In[153]: ''' print len(governance[governance['perm_rest_assets'].notnull()]) print len(governance[governance['perm_rest_assets'].isnull()]) print governance['perm_rest_assets'].describe(), '\n' governance['perm_rest_assets'] = governance['perm_rest_assets'].astype('int') print governance['perm_rest_assets'].describe() ''' # ##### Donor Restrictions -- Part (c) - Total Assets # In[157]: ''' governance['net_assets'] = governance['NetAssetsOrFundBalancesEOYAmt'] governance['net_assets'] = np.where(governance['net_assets'].isnull(), governance['NetAssetsOrFundBalancesEOY'], governance['net_assets']) print governance['net_assets'].describe(), '\n' governance['net_assets'] = governance['net_assets'].astype('int') print governance['net_assets'].describe() ''' # ##### Donor Restrictions -- Part (d) - Calculations # In[159]: ''' governance['donor_restrictions'] = (governance['temp_rest_assets']+governance['perm_rest_assets'])/governance['net_assets'] governance['donor_restrictions'].describe() ''' # In[162]: ''' governance['donor_restrictions'] = np.where(governance['donor_restrictions'].isnull(), 0, governance['donor_restrictions']) governance['donor_restrictions'].describe() ''' # In[169]: ''' df_all[df_all['EIN']=='010287624'][['FYE', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', 'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets', 'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances', 'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']] ''' # In[171]: #(718875+0)/(718875+1112268) # In[174]: #print (718875+1112268)-1831143 #print 1831143 - (718875+1112268) # ##### Tax-Exempt Bond # In[492]: from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING) import timeit start_time = timeit.default_timer() for index, row in governance[:].iterrows(): bond = 0 if pd.notnull(row['TaxExemptBondsInd']): #print 'not null!' if '#text' in row['TaxExemptBondsInd']: bond = row['TaxExemptBondsInd']['#text'] else: bond = row['TaxExemptBondsInd'] elif pd.notnull(row['TaxExemptBonds']): if '#text' in row['TaxExemptBonds']: bond = row['TaxExemptBonds']['#text'] else: bond = row['TaxExemptBonds'] governance.ix[index, 'tax_exempt_bond'] = bond clear_output() print ('Index: ', index, 'Tax-exempt bond:', bond), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' print governance['tax_exempt_bond'].value_counts().sum() # In[493]: print governance['tax_exempt_bond'].value_counts(), '\n' governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond']=='true', 1, governance['tax_exempt_bond']) governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond']=='false', 0, governance['tax_exempt_bond']) governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond'].isnull(), 0, governance['tax_exempt_bond']) governance['tax_exempt_bond'] = governance['tax_exempt_bond'].astype('int') print governance['tax_exempt_bond'].value_counts() # ##### Tax-Exempt Bonds, v2 (Harris et al.) # NO NEED TO DO THIS -- MY VARIABLE ALREADY TAPS IT # In[501]: governance[['TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities', 'tax_exempt_bond']][2:4] # ##### Taxable Revenues # AS IN YETMAN AND YETMAN'S *TAXABLE REVENUE* VARIABLE -- A REGULATORY OVERSIGHT VARIABLE, GIVEN THAT IT IS RELATED TO AUDIT LIKELIHOOD. # In[502]: governance['taxable_revenue'] = governance['NetUnrelatedBusTxblIncmAmt'] governance['taxable_revenue'] = np.where(governance['taxable_revenue'].isnull(), governance['NetUnrelatedBusinessTxblIncome'], governance['taxable_revenue']) print len(governance[governance['taxable_revenue'].isnull()]) print governance['taxable_revenue'].describe(), '\n' governance['taxable_revenue'] = np.where(governance['taxable_revenue'].isnull(), 0, governance['taxable_revenue']) print governance['taxable_revenue'].describe(), '\n' governance['taxable_revenue'] = governance['taxable_revenue'].astype('int') print governance['taxable_revenue'].describe(), '\n' #
CREATE TWO BINARY VERSION -- ONE FOR TAXABLE REVENUE GREATER THAN ZERO AND ONE FOR TAXABLE REVENUE OTHER THAN ZERO # In[503]: print len(governance[governance['taxable_revenue']>0]) print len(governance[governance['taxable_revenue']==0]) print len(governance[governance['taxable_revenue']<0]) governance['taxable_revenue_binary'] = np.where(governance['taxable_revenue']>0, 1,0) print governance['taxable_revenue_binary'].value_counts(), '\n' governance['taxable_revenue_neg_or_pos_rev_binary'] = np.where( ((governance['taxable_revenue']>0) | (governance['taxable_revenue']<0)), 1,0) print governance['taxable_revenue_neg_or_pos_rev_binary'].value_counts() # In[504]: governance[:1] # ##### Audited Financials # In[505]: governance['audited_financials'] = np.nan print len(governance[governance['audited_financials'].notnull()]) governance['audited_financials'] = np.where(governance['FSAuditedInd'].notnull(), governance['FSAuditedInd'], governance['audited_financials'] ) print len(governance[governance['audited_financials'].notnull()]), '\n' governance['audited_financials'] = np.where( ((governance['audited_financials'].isnull()) & (governance['FSAudited'].notnull())), governance['FSAudited'], governance['audited_financials'] ) print len(governance[governance['audited_financials'].notnull()]), '\n' governance['audited_financials'].value_counts() # In[506]: governance['audited_financials'] = np.where( governance['audited_financials']=='true', 1, governance['audited_financials'] ) governance['audited_financials'] = np.where( governance['audited_financials']=='1', 1, governance['audited_financials'] ) governance['audited_financials'] = np.where( governance['audited_financials']=='false', 0, governance['audited_financials'] ) governance['audited_financials'] = np.where( governance['audited_financials']=='0', 0, governance['audited_financials'] ) print len(governance[governance['audited_financials'].notnull()]) governance['audited_financials'].value_counts() # ##### Federal Grant Audit Performed # In[507]: governance['federal_grant_audit_performed'] = governance['FederalGrantAuditPerformedInd'] print len(governance[governance['federal_grant_audit_performed'].isnull()]) governance['federal_grant_audit_performed'] = np.where(governance['federal_grant_audit_performed'].isnull(), governance['FederalGrantAuditPerformed'], governance['federal_grant_audit_performed']) print len(governance[governance['federal_grant_audit_performed'].isnull()]) print governance['federal_grant_audit_performed'].value_counts() # In[508]: governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='true', 1, governance['federal_grant_audit_performed'] ) governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='1', 1, governance['federal_grant_audit_performed'] ) governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='false', 0, governance['federal_grant_audit_performed'] ) governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='0', 0, governance['federal_grant_audit_performed'] ) print len(governance[governance['federal_grant_audit_performed'].notnull()]) governance['federal_grant_audit_performed'].value_counts() # In[509]: governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed'].isnull(), 0, governance['federal_grant_audit_performed']) governance['federal_grant_audit_performed'].value_counts() # ##### Federal Grant Audit Required # In[510]: governance['federal_grant_audit_required'] = governance['FederalGrantAuditRequiredInd'] print len(governance[governance['federal_grant_audit_required'].isnull()]) governance['federal_grant_audit_required'] = np.where(governance['federal_grant_audit_required'].isnull(), governance['FederalGrantAuditRequired'], governance['federal_grant_audit_required']) print len(governance[governance['federal_grant_audit_required'].isnull()]), '\n' print governance['federal_grant_audit_required'].value_counts(), '\n' governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required'].isnull(), 0, governance['federal_grant_audit_required']) governance['federal_grant_audit_required'].value_counts() # In[511]: governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='true', 1, governance['federal_grant_audit_required'] ) governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='1', 1, governance['federal_grant_audit_required'] ) governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='false', 0, governance['federal_grant_audit_required'] ) governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='0', 0, governance['federal_grant_audit_required'] ) print len(governance[governance['federal_grant_audit_required'].notnull()]) governance['federal_grant_audit_required'].value_counts() # ##### Independent Audited Financial Statements # In[512]: governance['independent_audited_fs'] = governance['IndependentAuditFinclStmtInd'] governance['independent_audited_fs'] = np.where(governance['independent_audited_fs'].isnull(), governance['IndependentAuditFinancialStmt'], governance['independent_audited_fs']) print len(governance[governance['independent_audited_fs'].isnull()]) print governance['independent_audited_fs'].value_counts() # In[517]: len(governance[governance['independent_audited_fs'].isnull()]) #
GIVEN ABOVE ERROR, I WILL HAVE TO EXTRACT THE RIGHT SUB-VARIABLE FROM THE COLUMN # - NOTE: WHEN CHECKING VARIABLE TYPE DO NOT PUT 'DICT' IN QUOTES # In[518]: from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING) import timeit start_time = timeit.default_timer() for index, row in governance[:10].iterrows(): #bond = 0 if type(row['independent_audited_fs'])==dict and '#text' in row['independent_audited_fs']: #print 'looking' governance.ix[index, 'independent_audited_fs'] = row['independent_audited_fs']['#text'] clear_output() print ('Index: ', index, 'Independent audited F/S:', governance.ix[index, 'independent_audited_fs']), '\n' sys.stdout.flush() elapsed = timeit.default_timer() - start_time print '# of minutes: ', elapsed/60, '\n', '\n' # In[519]: governance[:1] # In[520]: governance['independent_audited_fs'].value_counts() # In[521]: governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='true', 1, governance['independent_audited_fs'] ) governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='1', 1, governance['independent_audited_fs'] ) governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='false', 0, governance['independent_audited_fs'] ) governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='0', 0, governance['independent_audited_fs'] ) print len(governance[governance['independent_audited_fs'].notnull()]) governance['independent_audited_fs'].value_counts() # ##### Family Business Relations # In[522]: governance['no_relations'] = governance['FamilyOrBusinessRlnInd'] governance['no_relations'] = np.where(governance['no_relations'].isnull(), governance['FamilyOrBusinessRelationship'], governance['no_relations']) print len(governance[governance['no_relations'].isnull()]) print governance['no_relations'].value_counts() # In[523]: governance['no_relations'] = np.where( governance['no_relations']=='true', 1, governance['no_relations'] ) governance['no_relations'] = np.where( governance['no_relations']=='1', 1, governance['no_relations'] ) governance['no_relations'] = np.where( governance['no_relations']=='false', 0, governance['no_relations'] ) governance['no_relations'] = np.where( governance['no_relations']=='0', 0, governance['no_relations'] ) print len(governance[governance['no_relations'].notnull()]) governance['no_relations'].value_counts() # ##### Election of Board Members # In[525]: governance['elected_board'] = governance['ElectionOfBoardMembersInd'] governance['elected_board'] = np.where(governance['elected_board'].isnull(), governance['ElectionOfBoardMembers'], governance['elected_board']) print len(governance[governance['elected_board'].isnull()]) print governance['elected_board'].value_counts() # In[526]: governance['elected_board'] = np.where( governance['elected_board']=='true', 1, governance['elected_board'] ) governance['elected_board'] = np.where( governance['elected_board']=='1', 1, governance['elected_board'] ) governance['elected_board'] = np.where( governance['elected_board']=='false', 0, governance['elected_board'] ) governance['elected_board'] = np.where( governance['elected_board']=='0', 0, governance['elected_board'] ) print len(governance[governance['elected_board'].notnull()]) governance['elected_board'].value_counts() # ##### Government Grant # In[ ]: governance[['GovernmentGrantsAmt', ]] # In[537]: governance['government_grant'] = governance['GovernmentGrantsAmt'] governance['government_grant'] = np.where(governance['government_grant'].isnull(), governance['GovernmentGrants'], governance['government_grant']) print len(governance[governance['government_grant'].isnull()]) print governance['government_grant'].describe(), '\n' governance['government_grant'] = np.where(governance['government_grant'].isnull(), 0,governance['government_grant'] ) print len(governance[governance['government_grant'].isnull()]) print governance['government_grant'].describe(), '\n' governance['government_grant'] = np.where(governance['government_grant']>0, 1,0) print len(governance[governance['government_grant'].isnull()]) print governance['government_grant'].value_counts(), '\n' # ##### Zero FR Expenses # In[555]: governance2['zero_FR'] = np.nan print len(governance2[governance2['zero_FR'].notnull()]), '\n' governance2['zero_FR'] = np.where(governance2['TotalFundrsngExpCurrentYear'].notnull(), governance2['TotalFundrsngExpCurrentYear'], governance2['zero_FR'] ) print len(governance2[governance2['zero_FR'].notnull()]), '\n' governance2['zero_FR'] = np.where( ((governance2['zero_FR'].isnull()) & (governance2['CYTotalFundraisingExpenseAmt'].notnull())), governance2['CYTotalFundraisingExpenseAmt'], governance2['zero_FR'] ) print len(governance2[governance2['zero_FR'].notnull()]) governance2['zero_FR'].value_counts()[:5] # In[558]: governance2['zero_FR'] = np.where(governance2['zero_FR']=='0', 1,0) governance2['zero_FR'].value_counts() # ##### Save DF # In[539]: governance.to_pickle('governance variables including original columns (n=7,133).pkl') # ##### Create subset of columns in preparation for merge # In[540]: print governance.columns.tolist() # In[542]: governance_cols = ['EIN', 'FYE', 'independent_directors_num', 'independent_directors_pct', 'independent_directors_pct_v2', 'voting_directors', 'outsourced_mgt', '990_review', 'audit_committee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'restricted_donations', 'tax_exempt_bond', 'taxable_revenue', 'taxable_revenue_binary', 'taxable_revenue_neg_or_pos_rev_binary', 'audited_financials', 'independent_audited_fs', 'federal_grant_audit_performed', 'federal_grant_audit_required', 'FeesForServicesAccounting_binary', 'no_relations', 'elected_board', 'government_grant', ] # ### Merge with Model 4 Data # In[394]: df = pd.read_pickle('2016 - Test 4 data.pkl') print "Number of columns:", len(df.columns) print "Number of observations:", len(df) df.head(1) # In[543]: df_test4 = pd.read_stata('Test 4 data.dta') print "Number of columns:", len(df_test4.columns) print "Number of observations:", len(df_test4) df_test4.head(1) # In[544]: print set(df.columns.tolist()) - set(df_test4.columns.tolist()), '\n' print set(df_test4.columns.tolist()) - set(df.columns.tolist()) # In[405]: #print len(df.columns) #print len(pd.merge(df, governance[governance_cols], left_on='EIN', right_on='EIN', how='left', indicator=True)), '\n' #merged = pd.merge(df, governance[governance_cols], left_on='EIN', right_on='EIN', how='left', indicator=True) #print len(merged.columns) #print len(merged), '\n' #print merged['_merge'].value_counts() # In[545]: print len(df.columns) print len(pd.merge(df_test4, governance[governance_cols], left_on='ein', right_on='EIN', how='left', indicator=True)), '\n' merged = pd.merge(df_test4, governance[governance_cols], left_on='ein', right_on='EIN', how='left', indicator=True) print len(merged.columns) print len(merged), '\n' print merged['_merge'].value_counts() # In[560]: #merged = merged.drop('_merge', 1) # In[562]: ''' print len(merged.columns) print len(pd.merge(merged, governance2[['EIN', 'zero_FR']], left_on='ein', right_on='EIN', how='left', indicator=True)), '\n' merged = pd.merge(merged, governance2[['EIN', 'zero_FR']], left_on='ein', right_on='EIN', how='left', indicator=True) print len(merged.columns) print len(merged), '\n' print merged['_merge'].value_counts() ''' # In[563]: merged[:1] # In[416]: #merged.rename(columns={'FYE_x':'FYE'}, inplace=True) # In[418]: pd.set_option('display.float_format', lambda x: '%.2f' % x) # ##### Descriptives for our governance variables # In[439]: merged[:1] # ##### Save DF # In[565]: merged.to_pickle('model 4 data with e-file governance variables (n=8,238).pkl') # In[566]: merged.to_excel('model 4 data with e-file governance variables (n=8,238).xls')