#!/usr/bin/env python
# coding: utf-8
# ### Load Packages
# In[440]:
3522 + 35 + 1032 + 62 + 206
# In[1]:
import sys
import time
import json
# In[2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3 as lite
from pandas import DataFrame
from pandas import Series
# In[3]:
#http://pandas.pydata.org/pandas-docs/stable/options.html
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 250)
# In[92]:
from __future__ import division
# #### Set working directory
# In[4]:
cd '/Users/gregorysaxton/Google Drive/SOX'
# ### Create List of EINs for '2016' charities -- I'll use this to create a list of EINs for matching with e-file database
# - Previous (Read in Saved List of EINs with 'current' (2016) donor advisories)
# - What I'm doing here is reading in the Stata 'Table 5' dataset so that I can create a list of the 8,304 relevant EINs.
# In[7]:
#f = open('2016 donor advisory EINs.json', 'r')
#advisories_2016 = json.load(f)
#print len(advisories_2016)
#print advisories_2016[:10]
# In[6]:
#df = pd.read_pickle('2016 - Test 4 data.pkl')
#print "Number of columns:", len(df.columns)
#print "Number of observations:", len(df)
#df.head(1)
# In[5]:
#print len(set(df['EIN'].tolist()))
#eins_2016 = df['EIN'].tolist()
#print len(eins_2016), len(set(eins_2016))
#print eins_2016[:5]
# #### Read in Test 4 data
# In[393]:
df = pd.read_pickle('2016 - Test 4 data.pkl')
print "Number of columns:", len(df.columns)
print "Number of observations:", len(df)
df.head(1)
# In[23]:
print len(set(df['EIN'].tolist()))
eins_2016 = df['EIN'].tolist()
print len(eins_2016), len(set(eins_2016))
print eins_2016[:5]
# In[ ]:
print len(eins_2016)
with open('eins_2016.json', 'w') as outfile:
json.dump(eins_2016, outfile)
# In[ ]:
#import json
#with open('valid_next_v8.json', 'r') as fp:
# valid_next = json.load(fp)
#print len(valid_next)
#print valid_next[:5]
# # Read DB into PANDAS DF
# - I'm taking all 29,440 990 e-filings for our 8,304 EINs and importing them from MongoDB into a Python PANDAS dataframe for manipulation.
# - As you can see if you scroll horizontally through this sample row, there are 691 columns!
# In[8]:
df_all = pd.read_pickle('e-file data for 7,133 of 8,304 2016 EINs.pkl')
print '# of columns:', len(df_all.columns)
print '# of observations:', len(df_all)
df_all.head(1)
# ## Identify Governance Variables
# In[17]:
vars = pd.read_pickle('variable descriptions 2010-2015 990 e-file data (first year only).pkl')
print '# of columns:', len(vars.columns)
print '# of observations:', len(vars)
vars.head(1)
# ##### Independent and voting directors
# In[37]:
cols_2013 = ['GoverningBodyVotingMembersCnt', 'IndependentVotingMemberCnt',
'VotingMembersGoverningBodyCnt', 'VotingMembersIndependentCnt']
cols_2010 = ['NbrIndependentVotingMembers', 'NbrVotingGoverningBodyMembers',
'NbrVotingMembersGoverningBody', 'NumberIndependentVotingMembers']
df_all[['FYE', 'EIN'] + cols_2010+cols_2013][:20]
# In[43]:
print len(df_all[(df_all['IndependentVotingMemberCnt'].notnull())])
print len(df_all[(df_all['VotingMembersIndependentCnt'].notnull())])
print len(df_all[(df_all['IndependentVotingMemberCnt'].isnull()) & (df_all['VotingMembersIndependentCnt'].notnull())])
print len(df_all[(df_all['VotingMembersIndependentCnt'].isnull()) & (df_all['IndependentVotingMemberCnt'].notnull())]), '\n'
print len(df_all[(df_all['GoverningBodyVotingMembersCnt'].isnull()) & (df_all['VotingMembersGoverningBodyCnt'].notnull())])
print len(df_all[(df_all['VotingMembersGoverningBodyCnt'].isnull()) & (df_all['GoverningBodyVotingMembersCnt'].notnull())])
#
Based on the above we can drop the first two for each year -- we'll go with the Section I, Line 3 and 4 versions, which seem to be the same
# In[242]:
cols = ['EIN', 'FYE']
#cols_2013 = ['VotingMembersGoverningBodyCnt', 'VotingMembersIndependentCnt']
#cols_2010 = ['NbrIndependentVotingMembers', 'NbrVotingMembersGoverningBody']
cols = cols + ['VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody',
'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers']
print cols
# ##### Outsourced Management
# In[243]:
cols = cols + ['DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties']
print cols
# ##### IRS 990 Review
# In[244]:
cols = cols + ['Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody']
print cols
# ##### Audit Committee
# In[245]:
cols = cols + ['AuditCommitteeInd', 'AuditCommittee']
print cols
# ##### Donor Restrictions
# In[54]:
7351259+4433997+11904841
#
What we see here is that *TemporarilyRstrNetAssetsGrp* + *PermanentlyRstrNetAssetsGrp* + *UnrestrictedNetAssetsGrp* = *TotalNetAssetsFundBalanceGrp* = *NetAssetsOrFundBalancesEOYAmt*
#
# - Given that *TotalNetAssetsFundBalanceGrp* is a 'group' variable, I should gather EOY values from the first two variables, take the sum, and then divide by *NetAssetsOrFundBalancesEOYAmt* in order to get a *DONOR RESTRICTIONS* VARIABLE SIMILAR TO YETMAN & YETMAN
# In[53]:
asset_cols = ['TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']
df_all[['EIN', 'FYE']+asset_cols][:10]
# In[62]:
#cols = cols + ['TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
#'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
#'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
#'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']
#print cols
# In[247]:
print df_all.columns.tolist()[-20:]
# In[249]:
cols = cols + ['perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions']
# ##### Tax-Exempt Bonds (municipal bonds)
# In[250]:
cols = cols + ['TaxExemptBondsInd', 'TaxExemptBonds']
print cols
# ##### Taxable Revenues
# In[251]:
cols = cols + ['NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome']
print cols
# ##### Audited Financials and Federal Audits
# In[69]:
#df_all['AuditedFinancialStmtAttInd'].value_counts()
# In[76]:
audit_cols = [#'AuditedFinancialStmtAttInd',
#'ConsolidatedAuditFinclStmtInd', 'ConsolidatedAuditFinancialStmt',
'FSAuditedInd', 'FSAudited', #'FSAuditedBasisGrp', 'FSAuditedBasis',
'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed',
'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired',
'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', ]
df_all[['EIN', 'FYE']+ audit_cols][:8]
# In[252]:
cols = cols + ['FSAuditedInd', 'FSAudited',
'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed',
'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired',
'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', ]
print cols
# #### Create *Donor Restrictions* variable as in Yetman & Yetman (2012)
# I NEED TO DO IT *BEFORE* COLLAPSING -- OTHERWISE IT WILL CAUSE PROBLEMS FOR AN EIN THAT HAS NO PERMANENTLY RESTRICTED ASSETS IN ONE YEAR, SAY, BUT IT DOES THE NEXT -- THE GROUPBY FUNCTION WILL BRING THOSE INTO THE SAME ROW.
# ##### Donor Restrictions -- Part (a) - Permanently Restricted Assets
# In[199]:
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in df_all[:].iterrows():
perm_rest = 0
if pd.notnull(row['PermanentlyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['PermanentlyRestrictedNetAssets']):
print 'going to second variable'
if 'EOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['EOY']
elif 'BOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['BOY']
df_all.ix[index, 'perm_rest_assets'] = perm_rest
clear_output()
print ('Index: ', index, 'Perm. Restr Assets:', perm_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print df_all['perm_rest_assets'].value_counts().sum()
# In[200]:
print len(df_all[df_all['perm_rest_assets'].notnull()])
print len(df_all[df_all['perm_rest_assets'].isnull()])
print df_all['perm_rest_assets'].describe(), '\n'
df_all['perm_rest_assets'] = df_all['perm_rest_assets'].astype('int')
print df_all['perm_rest_assets'].describe()
# ##### Donor Restrictions -- Part (b) - Temporarily Restricted Assets
# In[201]:
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in df_all[:].iterrows():
temp_rest = 0
if pd.notnull(row['TemporarilyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['TemporarilyRestrictedNetAssets']):
print 'going to second'
if 'EOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['EOY']
elif 'BOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['BOY']
df_all.ix[index, 'temp_rest_assets'] = temp_rest
clear_output()
print ('Index: ', index, 'Temp. Restr Assets:', temp_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print df_all['temp_rest_assets'].value_counts().sum()
# In[202]:
print len(df_all[df_all['temp_rest_assets'].notnull()])
print len(df_all[df_all['temp_rest_assets'].isnull()])
print df_all['temp_rest_assets'].describe(), '\n'
df_all['temp_rest_assets'] = df_all['temp_rest_assets'].astype('int')
print df_all['temp_rest_assets'].describe()
# ##### Donor Restrictions -- Part (c) - Total Assets
# In[203]:
df_all['net_assets'] = df_all['NetAssetsOrFundBalancesEOYAmt']
df_all['net_assets'] = np.where(df_all['net_assets'].isnull(),
df_all['NetAssetsOrFundBalancesEOY'], df_all['net_assets'])
print df_all['net_assets'].describe(), '\n'
df_all['net_assets'] = df_all['net_assets'].astype('int')
print df_all['net_assets'].describe()
# ##### Donor Restrictions -- Part (d) - Calculations
# In[204]:
df_all['donor_restrictions'] = (df_all['temp_rest_assets']+df_all['perm_rest_assets'])/df_all['net_assets']
df_all['donor_restrictions'].describe()
# In[205]:
df_all.dtypes[-5:]
#
THERE ARE 40 MISSING CASES -- ASSIGN THEM A VALUE OF ZERO
# In[207]:
print len(df_all[df_all['donor_restrictions'].isnull()])
# In[208]:
df_all['donor_restrictions'] = np.where(df_all['donor_restrictions'].isnull(), 0, df_all['donor_restrictions'])
df_all['donor_restrictions'].describe()
# In[209]:
print len(df_all[df_all['donor_restrictions'].isnull()])
#
CHECK A FEW >1 ROWS
# In[221]:
df_all[['FYE', 'donor_restrictions', 'temp_rest_assets', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'perm_rest_assets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'net_assets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY',
#'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
#'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']][-6:]
# In[211]:
df_all[['FYE', 'donor_restrictions', 'temp_rest_assets', 'perm_rest_assets', 'net_assets']][-6:]
# In[224]:
print len(df_all[df_all['donor_restrictions']>.99])
print len(df_all[df_all['donor_restrictions']>1])
df_all[df_all['donor_restrictions']>1][['FYE', 'donor_restrictions', 'temp_rest_assets', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'perm_rest_assets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'net_assets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY',
#'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
#'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']][-6:]
# ##### Fees for Services (proxy for audit)
# In[228]:
df_all['FeesForServicesAccountingGrp_v2'] = np.where(df_all['FeesForServicesAccountingGrp'].notnull(),
df_all['FeesForServicesAccountingGrp'],
df_all['FeesForServicesAccounting'])
print len(df_all[df_all['FeesForServicesAccountingGrp_v2'].isnull()]), '\n'
print len(df_all[df_all['FeesForServicesAccountingGrp_v2'].notnull()]), '\n'
# In[229]:
for index, row in df_all[df_all['FeesForServicesAccountingGrp_v2'].notnull()][:].iterrows():
#print type(row['FeesForServicesAccountingGrp_v2']), row['FeesForServicesAccountingGrp_v2']
if 'Total' in row['FeesForServicesAccountingGrp_v2']:
#print 'THERE IS "Total"', row['FeesForServicesAccountingGrp_v2']['Total']
df_all.ix[index, 'FeesForServicesAccounting_TotalAmt'] = row['FeesForServicesAccountingGrp_v2']['Total']
if 'TotalAmt' in row['FeesForServicesAccountingGrp_v2']:
#print 'THERE IS "TotalAmt"', row['FeesForServicesAccountingGrp_v2']['TotalAmt']
df_all.ix[index, 'FeesForServicesAccounting_TotalAmt'] = row['FeesForServicesAccountingGrp_v2']['TotalAmt']
clear_output()
print ('Index: ', index, 'Fees:', df_all.ix[index, 'FeesForServicesAccounting_TotalAmt']), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
df_all['FeesForServicesAccounting_TotalAmt'].value_counts().sum()
# In[230]:
df_all['FeesForServicesAccounting_TotalAmt'] = df_all['FeesForServicesAccounting_TotalAmt'].astype('float')
# In[231]:
df_all['FeesForServicesAccounting_binary'] = np.where(df_all['FeesForServicesAccounting_TotalAmt']>0, 1,0)
df_all['FeesForServicesAccounting_binary'].value_counts()
# In[232]:
print '# of rows where fees less than 0:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']<0]), '\n'
print '# of rows where fees equal zero:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']==0]), '\n'
print '# of rows where fees greater than zero:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']>0]), '\n'
print '# of rows missing values:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt'].isnull()]), '\n'
# In[233]:
print '# of rows that should have value of zero:', 4+1274+3723
# In[236]:
df_all[df_all['FeesForServicesAccounting_TotalAmt']<0][:][['FeesForServicesAccounting_TotalAmt']]
# In[237]:
df_all['FeesForServicesAccounting_binary'].value_counts()
# In[254]:
cols = cols + ['FeesForServicesAccounting_binary']
print cols
# In[287]:
print len(df_all[df_all['donor_restrictions']>100])
df_all[df_all['donor_restrictions']>100][cols]
# ##### Family Business Relations
# In[443]:
cols = cols + ['FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship']
print cols
# ##### Tax Exempt Bonds, Take II
# In[444]:
cols = cols + ['TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities']
print cols
# ##### Government Grants
# In[445]:
cols = cols + ['GovernmentGrantsAmt', 'GovernmentGrants']
print cols
# ##### Election of Board Members
# In[446]:
cols = cols + ['ElectionOfBoardMembersInd', 'ElectionOfBoardMembers']
print cols
# ##### Zero Fundraising
# In[551]:
cols = cols + ['CYTotalFundraisingExpenseAmt', 'TotalFundrsngExpCurrentYear']
print cols
# ##### Save DF
# In[255]:
len(df_all)
# In[256]:
df_all.to_pickle('e-file data for 7,133 of 8,304 2016 EINs (n=24940).pkl')
# ### Collapse Dataset -- take last
# In[447]:
df_all = df_all.sort_values(by=['EIN', 'FYE'], ascending=[1,0])
df_all[:5][cols]
# In[259]:
df_all[-3:][cols]
# In[552]:
print len(df_all[cols].groupby('EIN').agg('first'))
governance = df_all[cols].groupby('EIN').agg('first')
print len(governance)
governance = governance.reset_index()
governance[:5]
# In[449]:
governance['FYE'].value_counts()
# ### Create combined variables
# In[262]:
print cols
# ##### Independent Directors
# In[450]:
governance['independent_directors_num'] = governance['VotingMembersIndependentCnt']
governance['independent_directors_num'] = np.where(governance['independent_directors_num'].isnull(),
governance['NbrIndependentVotingMembers'], governance['independent_directors_num'])
print len(governance[governance['independent_directors_num'].isnull()])
#print governance['independent_directors_num'].value_counts()
governance['independent_directors_num'] = governance['independent_directors_num'].astype('int')
governance['independent_directors_num'].describe()
# In[451]:
governance[governance['VotingMembersIndependentCnt'].isnull()][:2]
# ##### Voting Directors
# In[452]:
governance['voting_directors'] = governance['VotingMembersGoverningBodyCnt']
governance['voting_directors'] = np.where(governance['voting_directors'].isnull(),
governance['NbrVotingMembersGoverningBody'], governance['voting_directors'])
print len(governance[governance['voting_directors'].isnull()])
#print governance['voting_directors'].value_counts()
governance['voting_directors'] = governance['voting_directors'].astype('int')
# ##### Percent Independent Directors
# In[457]:
governance['independent_directors_pct'] = governance['independent_directors_num']/governance['voting_directors']
governance['independent_directors_pct'].describe()
# In[456]:
governance[governance['independent_directors_pct'].isnull()]
# In[458]:
governance['independent_directors_pct'] = np.where(governance['independent_directors_pct'].isnull(), 0,
governance['independent_directors_pct'])
governance['independent_directors_pct'].describe()
#
Second version -- none over 100%
# In[461]:
print len(governance[governance['independent_directors_pct']>1])
governance['independent_directors_pct_v2'] = np.where(governance['independent_directors_pct']>1, 1,
governance['independent_directors_pct'])
print len(governance[governance['independent_directors_pct_v2']>1])
print governance['independent_directors_pct_v2'].describe()
# ##### Outsourced Management
# In[463]:
governance['outsourced_mgt'] = governance['DelegationOfMgmtDutiesInd']
governance['outsourced_mgt'] = np.where(governance['outsourced_mgt'].isnull(),
governance['DelegationOfManagementDuties'], governance['outsourced_mgt'])
print len(governance[governance['outsourced_mgt'].isnull()])
print governance['outsourced_mgt'].value_counts()
# In[464]:
governance['outsourced_mgt'] = np.where(governance['outsourced_mgt']=='true', 1, governance['outsourced_mgt'])
governance['outsourced_mgt'] = np.where(governance['outsourced_mgt']=='false', 0, governance['outsourced_mgt'])
print governance['outsourced_mgt'].value_counts()
# In[465]:
governance['outsourced_mgt']=governance['outsourced_mgt'].astype('int')
print governance['outsourced_mgt'].value_counts()
# ##### IRS 990 Review
# In[466]:
governance['990_review'] = governance['Form990ProvidedToGvrnBodyInd']
governance['990_review'] = np.where(governance['990_review'].isnull(),
governance['Form990ProvidedToGoverningBody'], governance['990_review'])
print len(governance[governance['990_review'].isnull()])
print governance['990_review'].value_counts()
# In[467]:
governance['990_review'] = np.where(governance['990_review']=='true', 1, governance['990_review'])
governance['990_review'] = np.where(governance['990_review']=='false', 0, governance['990_review'])
governance['990_review'] = governance['990_review'].astype('int')
print governance['990_review'].value_counts()
# ##### Audit Committee
# I'M ASSIGNING A VALUE OF '0' TO THE 138 CASES MISSING VALUES
# In[468]:
governance['audit_committee'] = governance['AuditCommitteeInd']
governance['audit_committee'] = np.where(governance['audit_committee'].isnull(),
governance['AuditCommittee'], governance['audit_committee'])
print len(governance[governance['audit_committee'].isnull()])
print governance['audit_committee'].value_counts()
# In[469]:
governance['audit_committee'] = np.where(governance['audit_committee']=='true', 1, governance['audit_committee'])
governance['audit_committee'] = np.where(governance['audit_committee']=='false', 0, governance['audit_committee'])
governance['audit_committee'] = np.where(governance['audit_committee'].isnull(), 0, governance['audit_committee'])
governance['audit_committee']=governance['audit_committee'].astype('int')
print governance['audit_committee'].value_counts()
# In[470]:
governance.describe().T
# ### Donor Restrictions
# - NOTE THAT I'M NOT DOING IT THIS WAY ANY MORE -- NOW IT IS DONE ABOVE IN *df_all* BEFORE COLLAPSING THE DATA
# - ALSO NOTE THAT THE 990 DATA MUST HAVE SOME ERRORS BECAUSE THERE ARE A FEW CASES WITH VERY HIGH VALUES ON *DONOR_RESTRICTIONS*
# In[471]:
#print governance['donor_restrictions'].min()
#governance['donor_restrictions'].max()
# In[473]:
print len(governance[governance['donor_restrictions']>100])
governance[governance['donor_restrictions']>100]
# In[472]:
print len(governance[governance['donor_restrictions']>10])
# In[290]:
governance[governance['donor_restrictions']>10]
# In[474]:
governance['donor_restrictions'] = np.where(governance['donor_restrictions']>100, 0, governance['donor_restrictions'])
governance['donor_restrictions'].describe()
# ##### Donor Restrictions (binary) - as in Harris et al.
# In[476]:
governance[:1]
# In[479]:
governance['restricted_donations'] = np.where(((governance['perm_rest_assets']>1)|
(governance['temp_rest_assets']>1)), 1,0)
governance['restricted_donations'].value_counts()
# In[491]:
governance[['perm_rest_assets', 'temp_rest_assets', 'restricted_donations']][20:25]
# ##### Donor Restrictions -- Part (a) - Temporarily Restricted Assets
# In[123]:
'''
print len(governance[governance['TemporarilyRstrNetAssetsGrp'].isnull()])
#, 'TemporarilyRestrictedNetAssets',
#'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
#'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY',
'''
# In[291]:
#governance[267:270]
# In[140]:
'''
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:].iterrows():
if pd.notnull(row['TemporarilyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['TemporarilyRestrictedNetAssets']):
print 'going to second'
if 'EOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['EOY']
elif 'BOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['BOY']
governance.ix[index, 'temp_rest_assets'] = temp_rest
clear_output()
print ('Index: ', index, 'Temp. Restr Assets:', temp_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print governance['temp_rest_assets'].value_counts().sum()
'''
# In[146]:
'''
print len(governance[governance['temp_rest_assets'].notnull()])
print len(governance[governance['temp_rest_assets'].isnull()])
print governance['temp_rest_assets'].describe()
governance['temp_rest_assets'] = governance['temp_rest_assets'].astype('int')
print governance['temp_rest_assets'].describe()
'''
# ##### Donor Restrictions -- Part (b) - Permanently Restricted Assets
# In[292]:
#governance[:2]
# In[149]:
'''
print len(governance[governance['PermanentlyRestrictedNetAssets'].notnull()])
print len(governance[governance['PermanentlyRestrictedNetAssets'].isnull()])
print len(governance[governance['PermanentlyRstrNetAssetsGrp'].notnull()])
print len(governance[governance['PermanentlyRstrNetAssetsGrp'].isnull()])
'''
# In[151]:
'''
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:].iterrows():
if pd.notnull(row['PermanentlyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['PermanentlyRestrictedNetAssets']):
print 'going to second variable'
if 'EOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['EOY']
elif 'BOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['BOY']
governance.ix[index, 'perm_rest_assets'] = perm_rest
clear_output()
print ('Index: ', index, 'Perm. Restr Assets:', perm_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print governance['perm_rest_assets'].value_counts().sum()
'''
# In[153]:
'''
print len(governance[governance['perm_rest_assets'].notnull()])
print len(governance[governance['perm_rest_assets'].isnull()])
print governance['perm_rest_assets'].describe(), '\n'
governance['perm_rest_assets'] = governance['perm_rest_assets'].astype('int')
print governance['perm_rest_assets'].describe()
'''
# ##### Donor Restrictions -- Part (c) - Total Assets
# In[157]:
'''
governance['net_assets'] = governance['NetAssetsOrFundBalancesEOYAmt']
governance['net_assets'] = np.where(governance['net_assets'].isnull(),
governance['NetAssetsOrFundBalancesEOY'], governance['net_assets'])
print governance['net_assets'].describe(), '\n'
governance['net_assets'] = governance['net_assets'].astype('int')
print governance['net_assets'].describe()
'''
# ##### Donor Restrictions -- Part (d) - Calculations
# In[159]:
'''
governance['donor_restrictions'] = (governance['temp_rest_assets']+governance['perm_rest_assets'])/governance['net_assets']
governance['donor_restrictions'].describe()
'''
# In[162]:
'''
governance['donor_restrictions'] = np.where(governance['donor_restrictions'].isnull(), 0, governance['donor_restrictions'])
governance['donor_restrictions'].describe()
'''
# In[169]:
'''
df_all[df_all['EIN']=='010287624'][['FYE', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']]
'''
# In[171]:
#(718875+0)/(718875+1112268)
# In[174]:
#print (718875+1112268)-1831143
#print 1831143 - (718875+1112268)
# ##### Tax-Exempt Bond
# In[492]:
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:].iterrows():
bond = 0
if pd.notnull(row['TaxExemptBondsInd']):
#print 'not null!'
if '#text' in row['TaxExemptBondsInd']:
bond = row['TaxExemptBondsInd']['#text']
else:
bond = row['TaxExemptBondsInd']
elif pd.notnull(row['TaxExemptBonds']):
if '#text' in row['TaxExemptBonds']:
bond = row['TaxExemptBonds']['#text']
else:
bond = row['TaxExemptBonds']
governance.ix[index, 'tax_exempt_bond'] = bond
clear_output()
print ('Index: ', index, 'Tax-exempt bond:', bond), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print governance['tax_exempt_bond'].value_counts().sum()
# In[493]:
print governance['tax_exempt_bond'].value_counts(), '\n'
governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond']=='true', 1, governance['tax_exempt_bond'])
governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond']=='false', 0, governance['tax_exempt_bond'])
governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond'].isnull(), 0, governance['tax_exempt_bond'])
governance['tax_exempt_bond'] = governance['tax_exempt_bond'].astype('int')
print governance['tax_exempt_bond'].value_counts()
# ##### Tax-Exempt Bonds, v2 (Harris et al.)
# NO NEED TO DO THIS -- MY VARIABLE ALREADY TAPS IT
# In[501]:
governance[['TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities', 'tax_exempt_bond']][2:4]
# ##### Taxable Revenues
# AS IN YETMAN AND YETMAN'S *TAXABLE REVENUE* VARIABLE -- A REGULATORY OVERSIGHT VARIABLE, GIVEN THAT IT IS RELATED TO AUDIT LIKELIHOOD.
# In[502]:
governance['taxable_revenue'] = governance['NetUnrelatedBusTxblIncmAmt']
governance['taxable_revenue'] = np.where(governance['taxable_revenue'].isnull(),
governance['NetUnrelatedBusinessTxblIncome'], governance['taxable_revenue'])
print len(governance[governance['taxable_revenue'].isnull()])
print governance['taxable_revenue'].describe(), '\n'
governance['taxable_revenue'] = np.where(governance['taxable_revenue'].isnull(), 0, governance['taxable_revenue'])
print governance['taxable_revenue'].describe(), '\n'
governance['taxable_revenue'] = governance['taxable_revenue'].astype('int')
print governance['taxable_revenue'].describe(), '\n'
#
CREATE TWO BINARY VERSION -- ONE FOR TAXABLE REVENUE GREATER THAN ZERO AND ONE FOR TAXABLE REVENUE OTHER THAN ZERO
# In[503]:
print len(governance[governance['taxable_revenue']>0])
print len(governance[governance['taxable_revenue']==0])
print len(governance[governance['taxable_revenue']<0])
governance['taxable_revenue_binary'] = np.where(governance['taxable_revenue']>0, 1,0)
print governance['taxable_revenue_binary'].value_counts(), '\n'
governance['taxable_revenue_neg_or_pos_rev_binary'] = np.where( ((governance['taxable_revenue']>0) |
(governance['taxable_revenue']<0)), 1,0)
print governance['taxable_revenue_neg_or_pos_rev_binary'].value_counts()
# In[504]:
governance[:1]
# ##### Audited Financials
# In[505]:
governance['audited_financials'] = np.nan
print len(governance[governance['audited_financials'].notnull()])
governance['audited_financials'] = np.where(governance['FSAuditedInd'].notnull(), governance['FSAuditedInd'],
governance['audited_financials'] )
print len(governance[governance['audited_financials'].notnull()]), '\n'
governance['audited_financials'] = np.where( ((governance['audited_financials'].isnull()) &
(governance['FSAudited'].notnull())),
governance['FSAudited'], governance['audited_financials'] )
print len(governance[governance['audited_financials'].notnull()]), '\n'
governance['audited_financials'].value_counts()
# In[506]:
governance['audited_financials'] = np.where( governance['audited_financials']=='true', 1, governance['audited_financials'] )
governance['audited_financials'] = np.where( governance['audited_financials']=='1', 1, governance['audited_financials'] )
governance['audited_financials'] = np.where( governance['audited_financials']=='false', 0, governance['audited_financials'] )
governance['audited_financials'] = np.where( governance['audited_financials']=='0', 0, governance['audited_financials'] )
print len(governance[governance['audited_financials'].notnull()])
governance['audited_financials'].value_counts()
# ##### Federal Grant Audit Performed
# In[507]:
governance['federal_grant_audit_performed'] = governance['FederalGrantAuditPerformedInd']
print len(governance[governance['federal_grant_audit_performed'].isnull()])
governance['federal_grant_audit_performed'] = np.where(governance['federal_grant_audit_performed'].isnull(),
governance['FederalGrantAuditPerformed'], governance['federal_grant_audit_performed'])
print len(governance[governance['federal_grant_audit_performed'].isnull()])
print governance['federal_grant_audit_performed'].value_counts()
# In[508]:
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='true', 1, governance['federal_grant_audit_performed'] )
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='1', 1, governance['federal_grant_audit_performed'] )
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='false', 0, governance['federal_grant_audit_performed'] )
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='0', 0, governance['federal_grant_audit_performed'] )
print len(governance[governance['federal_grant_audit_performed'].notnull()])
governance['federal_grant_audit_performed'].value_counts()
# In[509]:
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed'].isnull(),
0, governance['federal_grant_audit_performed'])
governance['federal_grant_audit_performed'].value_counts()
# ##### Federal Grant Audit Required
# In[510]:
governance['federal_grant_audit_required'] = governance['FederalGrantAuditRequiredInd']
print len(governance[governance['federal_grant_audit_required'].isnull()])
governance['federal_grant_audit_required'] = np.where(governance['federal_grant_audit_required'].isnull(),
governance['FederalGrantAuditRequired'], governance['federal_grant_audit_required'])
print len(governance[governance['federal_grant_audit_required'].isnull()]), '\n'
print governance['federal_grant_audit_required'].value_counts(), '\n'
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required'].isnull(),
0, governance['federal_grant_audit_required'])
governance['federal_grant_audit_required'].value_counts()
# In[511]:
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='true', 1, governance['federal_grant_audit_required'] )
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='1', 1, governance['federal_grant_audit_required'] )
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='false', 0, governance['federal_grant_audit_required'] )
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='0', 0, governance['federal_grant_audit_required'] )
print len(governance[governance['federal_grant_audit_required'].notnull()])
governance['federal_grant_audit_required'].value_counts()
# ##### Independent Audited Financial Statements
# In[512]:
governance['independent_audited_fs'] = governance['IndependentAuditFinclStmtInd']
governance['independent_audited_fs'] = np.where(governance['independent_audited_fs'].isnull(),
governance['IndependentAuditFinancialStmt'], governance['independent_audited_fs'])
print len(governance[governance['independent_audited_fs'].isnull()])
print governance['independent_audited_fs'].value_counts()
# In[517]:
len(governance[governance['independent_audited_fs'].isnull()])
#
GIVEN ABOVE ERROR, I WILL HAVE TO EXTRACT THE RIGHT SUB-VARIABLE FROM THE COLUMN
# - NOTE: WHEN CHECKING VARIABLE TYPE DO NOT PUT 'DICT' IN QUOTES
# In[518]:
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:10].iterrows():
#bond = 0
if type(row['independent_audited_fs'])==dict and '#text' in row['independent_audited_fs']:
#print 'looking'
governance.ix[index, 'independent_audited_fs'] = row['independent_audited_fs']['#text']
clear_output()
print ('Index: ', index, 'Independent audited F/S:', governance.ix[index, 'independent_audited_fs']), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
# In[519]:
governance[:1]
# In[520]:
governance['independent_audited_fs'].value_counts()
# In[521]:
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='true', 1, governance['independent_audited_fs'] )
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='1', 1, governance['independent_audited_fs'] )
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='false', 0, governance['independent_audited_fs'] )
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='0', 0, governance['independent_audited_fs'] )
print len(governance[governance['independent_audited_fs'].notnull()])
governance['independent_audited_fs'].value_counts()
# ##### Family Business Relations
# In[522]:
governance['no_relations'] = governance['FamilyOrBusinessRlnInd']
governance['no_relations'] = np.where(governance['no_relations'].isnull(),
governance['FamilyOrBusinessRelationship'], governance['no_relations'])
print len(governance[governance['no_relations'].isnull()])
print governance['no_relations'].value_counts()
# In[523]:
governance['no_relations'] = np.where( governance['no_relations']=='true', 1, governance['no_relations'] )
governance['no_relations'] = np.where( governance['no_relations']=='1', 1, governance['no_relations'] )
governance['no_relations'] = np.where( governance['no_relations']=='false', 0, governance['no_relations'] )
governance['no_relations'] = np.where( governance['no_relations']=='0', 0, governance['no_relations'] )
print len(governance[governance['no_relations'].notnull()])
governance['no_relations'].value_counts()
# ##### Election of Board Members
# In[525]:
governance['elected_board'] = governance['ElectionOfBoardMembersInd']
governance['elected_board'] = np.where(governance['elected_board'].isnull(),
governance['ElectionOfBoardMembers'], governance['elected_board'])
print len(governance[governance['elected_board'].isnull()])
print governance['elected_board'].value_counts()
# In[526]:
governance['elected_board'] = np.where( governance['elected_board']=='true', 1, governance['elected_board'] )
governance['elected_board'] = np.where( governance['elected_board']=='1', 1, governance['elected_board'] )
governance['elected_board'] = np.where( governance['elected_board']=='false', 0, governance['elected_board'] )
governance['elected_board'] = np.where( governance['elected_board']=='0', 0, governance['elected_board'] )
print len(governance[governance['elected_board'].notnull()])
governance['elected_board'].value_counts()
# ##### Government Grant
# In[ ]:
governance[['GovernmentGrantsAmt', ]]
# In[537]:
governance['government_grant'] = governance['GovernmentGrantsAmt']
governance['government_grant'] = np.where(governance['government_grant'].isnull(),
governance['GovernmentGrants'], governance['government_grant'])
print len(governance[governance['government_grant'].isnull()])
print governance['government_grant'].describe(), '\n'
governance['government_grant'] = np.where(governance['government_grant'].isnull(), 0,governance['government_grant'] )
print len(governance[governance['government_grant'].isnull()])
print governance['government_grant'].describe(), '\n'
governance['government_grant'] = np.where(governance['government_grant']>0, 1,0)
print len(governance[governance['government_grant'].isnull()])
print governance['government_grant'].value_counts(), '\n'
# ##### Zero FR Expenses
# In[555]:
governance2['zero_FR'] = np.nan
print len(governance2[governance2['zero_FR'].notnull()]), '\n'
governance2['zero_FR'] = np.where(governance2['TotalFundrsngExpCurrentYear'].notnull(),
governance2['TotalFundrsngExpCurrentYear'],
governance2['zero_FR'] )
print len(governance2[governance2['zero_FR'].notnull()]), '\n'
governance2['zero_FR'] = np.where( ((governance2['zero_FR'].isnull()) &
(governance2['CYTotalFundraisingExpenseAmt'].notnull())),
governance2['CYTotalFundraisingExpenseAmt'], governance2['zero_FR'] )
print len(governance2[governance2['zero_FR'].notnull()])
governance2['zero_FR'].value_counts()[:5]
# In[558]:
governance2['zero_FR'] = np.where(governance2['zero_FR']=='0', 1,0)
governance2['zero_FR'].value_counts()
# ##### Save DF
# In[539]:
governance.to_pickle('governance variables including original columns (n=7,133).pkl')
# ##### Create subset of columns in preparation for merge
# In[540]:
print governance.columns.tolist()
# In[542]:
governance_cols = ['EIN', 'FYE',
'independent_directors_num', 'independent_directors_pct', 'independent_directors_pct_v2',
'voting_directors',
'outsourced_mgt', '990_review', 'audit_committee',
'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions',
'restricted_donations',
'tax_exempt_bond', 'taxable_revenue', 'taxable_revenue_binary',
'taxable_revenue_neg_or_pos_rev_binary',
'audited_financials', 'independent_audited_fs',
'federal_grant_audit_performed', 'federal_grant_audit_required',
'FeesForServicesAccounting_binary',
'no_relations',
'elected_board',
'government_grant',
]
# ### Merge with Model 4 Data
# In[394]:
df = pd.read_pickle('2016 - Test 4 data.pkl')
print "Number of columns:", len(df.columns)
print "Number of observations:", len(df)
df.head(1)
# In[543]:
df_test4 = pd.read_stata('Test 4 data.dta')
print "Number of columns:", len(df_test4.columns)
print "Number of observations:", len(df_test4)
df_test4.head(1)
# In[544]:
print set(df.columns.tolist()) - set(df_test4.columns.tolist()), '\n'
print set(df_test4.columns.tolist()) - set(df.columns.tolist())
# In[405]:
#print len(df.columns)
#print len(pd.merge(df, governance[governance_cols], left_on='EIN', right_on='EIN', how='left', indicator=True)), '\n'
#merged = pd.merge(df, governance[governance_cols], left_on='EIN', right_on='EIN', how='left', indicator=True)
#print len(merged.columns)
#print len(merged), '\n'
#print merged['_merge'].value_counts()
# In[545]:
print len(df.columns)
print len(pd.merge(df_test4, governance[governance_cols], left_on='ein', right_on='EIN', how='left', indicator=True)), '\n'
merged = pd.merge(df_test4, governance[governance_cols], left_on='ein', right_on='EIN', how='left', indicator=True)
print len(merged.columns)
print len(merged), '\n'
print merged['_merge'].value_counts()
# In[560]:
#merged = merged.drop('_merge', 1)
# In[562]:
'''
print len(merged.columns)
print len(pd.merge(merged, governance2[['EIN', 'zero_FR']], left_on='ein', right_on='EIN', how='left', indicator=True)), '\n'
merged = pd.merge(merged, governance2[['EIN', 'zero_FR']], left_on='ein', right_on='EIN', how='left', indicator=True)
print len(merged.columns)
print len(merged), '\n'
print merged['_merge'].value_counts()
'''
# In[563]:
merged[:1]
# In[416]:
#merged.rename(columns={'FYE_x':'FYE'}, inplace=True)
# In[418]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# ##### Descriptives for our governance variables
# In[439]:
merged[:1]
# ##### Save DF
# In[565]:
merged.to_pickle('model 4 data with e-file governance variables (n=8,238).pkl')
# In[566]:
merged.to_excel('model 4 data with e-file governance variables (n=8,238).xls')