3522 + 35 + 1032 + 62 + 206
4857
import sys
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3 as lite
from pandas import DataFrame
from pandas import Series
#http://pandas.pydata.org/pandas-docs/stable/options.html
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 250)
from __future__ import division
cd '/Users/gregorysaxton/Google Drive/SOX'
/Users/gregorysaxton/Google Drive/SOX
#f = open('2016 donor advisory EINs.json', 'r')
#advisories_2016 = json.load(f)
#print len(advisories_2016)
#print advisories_2016[:10]
#df = pd.read_pickle('2016 - Test 4 data.pkl')
#print "Number of columns:", len(df.columns)
#print "Number of observations:", len(df)
#df.head(1)
#print len(set(df['EIN'].tolist()))
#eins_2016 = df['EIN'].tolist()
#print len(eins_2016), len(set(eins_2016))
#print eins_2016[:5]
df = pd.read_pickle('2016 - Test 4 data.pkl')
print "Number of columns:", len(df.columns)
print "Number of observations:", len(df)
df.head(1)
Number of columns: 35 Number of observations: 8304
donor_advisory | donor_advisory_2016 | donor_advisory_2011_to_2016 | org_id | EIN | FYE | Form 990 FYE | ratings_system | 2011_data | 2016_data | conflict_of_interest_policy_v2 | records_retention_policy_v2 | whistleblower_policy_v2 | SOX_policies | SOX_policies_binary | SOX_policies_all_binary | program_efficiency | complexity | complexity_2011 | age | total_revenue_logged | category | state | tot_rev | category_Animals | category_Arts, Culture, Humanities | category_Community Development | category_Education | category_Environment | category_Health | category_Human Services | category_Human and Civil Rights | category_International | category_Religion | category_Research and Public Policy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
50709 | 0.0 | 0.0 | 0.0 | 5954 | 010202467 | FY2014 | 2014-12 | CN 2.1 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.0 | 0.794457 | 6.0 | NaN | 62.0 | 16.377993 | Research and Public Policy | ME | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
print len(set(df['EIN'].tolist()))
eins_2016 = df['EIN'].tolist()
print len(eins_2016), len(set(eins_2016))
print eins_2016[:5]
8304 8304 8304 ['010202467', '010211478', '010211513', '010211530', '010211543']
print len(eins_2016)
with open('eins_2016.json', 'w') as outfile:
json.dump(eins_2016, outfile)
#import json
#with open('valid_next_v8.json', 'r') as fp:
# valid_next = json.load(fp)
#print len(valid_next)
#print valid_next[:5]
df_all = pd.read_pickle('e-file data for 7,133 of 8,304 2016 EINs.pkl')
print '# of columns:', len(df_all.columns)
print '# of observations:', len(df_all)
df_all.head(1)
# of columns: 695 # of observations: 29440
@documentId | @referenceDocumentId | @referenceDocumentName | @softwareId | @softwareVersion | @softwareVersionNum | AccountantCompileOrReview | AccountantCompileOrReviewBasis | AccountantCompileOrReviewInd | AccountsPayableAccrExpnssGrp | AccountsPayableAccruedExpenses | AccountsReceivable | AccountsReceivableGrp | AcctCompileOrReviewBasisGrp | ActivitiesConductedPartnership | ActivitiesConductedPrtshpInd | Activity2 | Activity3 | ActivityCd | ActivityCode | ActivityOrMissionDesc | ActivityOrMissionDescription | ActivityOther | AddressChange | AddressChangeInd | AddressPrincipalOfficerForeign | AddressPrincipalOfficerUS | Advertising | AdvertisingGrp | AllAffiliatesIncluded | AllAffiliatesIncludedInd | AllOtherContributions | AllOtherContributionsAmt | AllOtherExpenses | AllOtherExpensesGrp | AmendedReturn | AmendedReturnInd | AnnualDisclosureCoveredPersons | AnnualDisclosureCoveredPrsnInd | AuditCommittee | AuditCommitteeInd | AuditedFinancialStmtAttInd | BackupWthldComplianceInd | BalanceSheetAmountsReported | BenefitsPaidToMembersCY | BenefitsPaidToMembersPriorYear | BenefitsToMembers | BenefitsToMembersGrp | BooksInCareOfDetail | BsnssRltnshpThruFamilyMember | BsnssRltnshpWithOrganization | BusinessRlnWithFamMemInd | BusinessRlnWithOfficerEntInd | BusinessRlnWithOrgMemInd | CYBenefitsPaidToMembersAmt | CYContributionsGrantsAmt | CYGrantsAndSimilarPaidAmt | CYInvestmentIncomeAmt | CYOtherExpensesAmt | CYOtherRevenueAmt | CYProgramServiceRevenueAmt | CYRevenuesLessExpensesAmt | CYSalariesCompEmpBnftPaidAmt | CYTotalExpensesAmt | CYTotalFundraisingExpenseAmt | CYTotalProfFndrsngExpnsAmt | CYTotalRevenueAmt | CapStckTrstPrinCurrentFunds | CapStkTrPrinCurrentFundsGrp | CashNonInterestBearing | CashNonInterestBearingGrp | ChangeToOrgDocumentsInd | ChangesToOrganizingDocs | CntrbtnsRprtdFundraisingEvents | CntrctRcvdGreaterThan100KCnt | CollectionsOfArt | CollectionsOfArtInd | CompCurrentOfcrDirectorsGrp | CompCurrentOfficersDirectors | CompDisqualPersons | CompDisqualPersonsGrp | CompensationFromOtherSources | CompensationFromOtherSrcsInd | CompensationProcessCEO | CompensationProcessCEOInd | CompensationProcessOther | CompensationProcessOtherInd | ComplianceWithBackupWitholding | ConferencesMeetings | ConferencesMeetingsGrp | ConflictOfInterestPolicy | ConflictOfInterestPolicyInd | ConservationEasements | ConservationEasementsInd | ConsolidatedAuditFinancialStmt | ConsolidatedAuditFinclStmtInd | ContractTerminationInd | ContractorCompensation | ContractorCompensationGrp | ContriRptFundraisingEventAmt | ContributionsGrantsCurrentYear | ContributionsGrantsPriorYear | CostOfGoodsSold | CostOfGoodsSoldAmt | CountryLegalDomicile | CreditCounseling | CreditCounselingInd | DAFExcessBusinessHoldingsInd | DLN | DecisionsSubjectToApprovaInd | DecisionsSubjectToApproval | DeductibleArtContributionInd | DeductibleContributionsOfArt | DeductibleNonCashContriInd | DeductibleNonCashContributions | DeferredRevenue | DeferredRevenueGrp | DelegationOfManagementDuties | DelegationOfMgmtDutiesInd | DepreciationDepletion | DepreciationDepletionGrp | Desc | DescribedIn501C3 | DescribedInSection501c3Ind | Description | DisregardedEntity | DisregardedEntityInd | DistributionToDonor | DistributionToDonorInd | DoNotFollowSFAS117 | DocumentRetentionPolicy | DocumentRetentionPolicyInd | DoingBusinessAs | DoingBusinessAsName | DonatedServicesAndUseFcltsAmt | DonorAdvisedFundInd | DonorAdvisedFunds | EIN | ElectionOfBoardMembers | ElectionOfBoardMembersInd | EmployeeCnt | EmploymentTaxReturnsFiled | EmploymentTaxReturnsFiledInd | EngagedInExcessBenefitTransInd | EscrowAccount | EscrowAccountInd | EscrowAccountLiability | EscrowAccountLiabilityGrp | ExcessBenefitTransaction | ExcessBusinessHoldings | ExcessBusinessHoldingsInd | Expense | ExpenseAmt | FSAudited | FSAuditedBasis | FSAuditedBasisGrp | FSAuditedInd | FamilyOrBusinessRelationship | FamilyOrBusinessRlnInd | FederalGrantAuditPerformed | FederalGrantAuditPerformedInd | FederalGrantAuditRequired | FederalGrantAuditRequiredInd | FederatedCampaigns | FederatedCampaignsAmt | FeesForServicesAccounting | FeesForServicesAccountingGrp | FeesForServicesInvstMgmntFees | FeesForServicesLegal | FeesForServicesLegalGrp | FeesForServicesLobbying | FeesForServicesLobbyingGrp | FeesForServicesManagement | FeesForServicesManagementGrp | FeesForServicesOther | FeesForServicesOtherGrp | FeesForServicesProfFundraising | FeesForSrvcInvstMgmntFeesGrp | FiledLieu1041 | FinalReturnInd | FinancialStatementBoth | FinancialStatementConsolidated | FinancialStatementSeparate | FinancialStmtAttached | FollowSFAS117 | ForeignActivities | ForeignActivitiesInd | ForeignAddress | ForeignCountryCd | ForeignFinancialAccount | ForeignFinancialAccountInd | ForeignGrants | ForeignGrantsGrp | ForeignOffice | ForeignOfficeInd | Form1098CFiled | Form1098CFiledInd | Form720Filed | Form8282FiledCnt | Form8282PropertyDisposedOf | Form8282PropertyDisposedOfInd | Form8886TFiled | Form8886TFiledInd | Form8899Filed | Form8899Filedind | Form990-TFiled | Form990PartVIISectionA | Form990PartVIISectionAGrp | Form990ProvidedToGoverningBody | Form990ProvidedToGvrnBodyInd | Form990TFiledInd | FormType | FormationYr | FormerOfcrEmployeesListedInd | FormersListed | FundraisingActivities | FundraisingActivitiesInd | FundraisingAmt | FundraisingDirectExpenses | FundraisingDirectExpensesAmt | FundraisingEvents | FundraisingGrossIncomeAmt | FundsToPayPremiums | GainOrLoss | GainOrLossGrp | Gaming | GamingActivitiesInd | GamingDirectExpenses | GamingDirectExpensesAmt | GamingGrossIncomeAmt | GoverningBodyVotingMembersCnt | GovernmentGrants | GovernmentGrantsAmt | GrantAmt | GrantToRelatedPerson | GrantToRelatedPersonInd | Grants | GrantsAndSimilarAmntsCY | GrantsAndSimilarAmntsPriorYear | GrantsPayable | GrantsPayableGrp | GrantsToDomesticIndividuals | GrantsToDomesticIndividualsGrp | GrantsToDomesticOrgs | GrantsToDomesticOrgsGrp | GrantsToIndividuals | GrantsToIndividualsInd | GrantsToOrganizations | GrantsToOrganizationsInd | GrossAmountSalesAssets | GrossAmountSalesAssetsGrp | GrossIncomeFundraisingEvents | GrossIncomeGaming | GrossReceipts | GrossReceiptsAmt | GrossRents | GrossRentsGrp | GrossSalesOfInventory | GrossSalesOfInventoryAmt | GroupExemptionNum | GroupExemptionNumber | GroupReturnForAffiliates | GroupReturnForAffiliatesInd | Hospital | IRPDocumentCnt | IRPDocumentW2GCnt | IncludeFIN48FootnoteInd | IncmFromInvestBondProceedsGrp | IncomeFromInvestBondProceeds | IndependentAuditFinancialStmt | IndependentAuditFinclStmtInd | IndependentVotingMemberCnt | IndivRcvdGreaterThan100KCnt | IndoorTanningServices | IndoorTanningServicesInd | InfoInScheduleOPartIII | InfoInScheduleOPartIIIInd | InfoInScheduleOPartIX | InfoInScheduleOPartIXInd | InfoInScheduleOPartV | InfoInScheduleOPartVI | InfoInScheduleOPartVII | InfoInScheduleOPartVIII | InfoInScheduleOPartVIIIInd | InfoInScheduleOPartVIIInd | InfoInScheduleOPartVIInd | InfoInScheduleOPartVInd | InfoInScheduleOPartX | InfoInScheduleOPartXI | InfoInScheduleOPartXII | InfoInScheduleOPartXIIInd | InfoInScheduleOPartXIInd | InfoInScheduleOPartXInd | InformationTechnology | InformationTechnologyGrp | InitialReturn | InitialReturnInd | Insurance | InsuranceGrp | IntangibleAssets | IntangibleAssetsGrp | Interest | InterestGrp | InventoriesForSaleOrUse | InventoriesForSaleOrUseGrp | InvestTaxExemptBonds | InvestTaxExemptBondsInd | InvestmentExpenseAmt | InvestmentInJointVenture | InvestmentInJointVentureInd | InvestmentIncome | InvestmentIncomeCurrentYear | InvestmentIncomeGrp | InvestmentIncomePriorYear | InvestmentsOtherSecurities | InvestmentsOtherSecuritiesGrp | InvestmentsProgramRelated | InvestmentsProgramRelatedGrp | InvestmentsPubTradedSecGrp | InvestmentsPubTradedSecurities | IsAvailable | IsElectronic | JointCosts | JointCostsInd | LandBldgEquipAccumDeprecAmt | LandBldgEquipBasisNetGrp | LandBldgEquipCostOrOtherBssAmt | LandBldgEquipmentAccumDeprec | LandBuildingsEquipmentBasis | LandBuildingsEquipmentBasisNet | LastUpdated | LegalDomicileStateCd | LessCostOthBasisSalesExpenses | LessCostOthBasisSalesExpnssGrp | LessRentalExpenses | LessRentalExpensesGrp | LicensedMoreThanOneState | LicensedMoreThanOneStateInd | LoanOutstandingInd | LoanToOfficerOrDQP | LoansFromOfficersDirectors | LoansFromOfficersDirectorsGrp | LobbyingActivities | LobbyingActivitiesInd | LocalChapters | LocalChaptersInd | MaterialDiversionOrMisuse | MaterialDiversionOrMisuseInd | MembersOrStockholders | MembersOrStockholdersInd | MembershipDues | MembershipDuesAmt | MethodOfAccountingAccrual | MethodOfAccountingAccrualInd | MethodOfAccountingCash | MethodOfAccountingCashInd | MethodOfAccountingOther | MethodOfAccountingOtherInd | MinutesOfCommittees | MinutesOfCommitteesInd | MinutesOfGoverningBody | MinutesOfGoverningBodyInd | MiscellaneousRevenue | MiscellaneousRevenueGrp | MissionDesc | MissionDescription | MoreThan5000KToIndividuals | MoreThan5000KToIndividualsInd | MoreThan5000KToOrgInd | MoreThan5000KToOrganizations | MortNotesPyblSecuredInvestProp | MortgNotesPyblScrdInvstPropGrp | NameOfForeignCountry | NameOfPrincipalOfficerBusiness | NameOfPrincipalOfficerPerson | NbrIndependentVotingMembers | NbrVotingGoverningBodyMembers | NbrVotingMembersGoverningBody | NetAssetsOrFundBalancesBOY | NetAssetsOrFundBalancesBOYAmt | NetAssetsOrFundBalancesEOY | NetAssetsOrFundBalancesEOYAmt | NetGainOrLossInvestments | NetGainOrLossInvestmentsGrp | NetIncmFromFundraisingEvtGrp | NetIncomeFromFundraisingEvents | NetIncomeFromGaming | NetIncomeFromGamingGrp | NetIncomeOrLoss | NetIncomeOrLossGrp | NetRentalIncomeOrLoss | NetRentalIncomeOrLossGrp | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | NetUnrlzdGainsLossesInvstAmt | NoListedPersonsCompensated | NoListedPersonsCompensatedInd | NonDeductibleContributions | NonDeductibleDisclosure | NoncashContributions | NoncashContributionsAmt | NondeductibleContriDisclInd | NondeductibleContributionsInd | NumberFormsTransmittedWith1096 | NumberIndependentVotingMembers | NumberIndividualsGT100K | NumberOf8282Filed | NumberOfContractorsGT100K | NumberOfEmployees | NumberW2GIncluded | ObjectId | Occupancy | OccupancyGrp | OfficeExpenses | OfficeExpensesGrp | OfficerEntityWithBsnssRltnshp | OfficerMailingAddress | OfficerMailingAddressInd | OnBehalfOfIssuer | OnBehalfOfIssuerInd | OperateHospitalInd | OrgDoesNotFollowSFAS117Ind | OrgFiledInLieuOfForm1041Ind | Organization501c | Organization501c3 | Organization501c3Ind | Organization501cInd | OrganizationFollowsSFAS117Ind | OrganizationName | OthNotesLoansReceivableNetGrp | OtherAssetsTotal | OtherAssetsTotalGrp | OtherChangesInNetAssetsAmt | OtherEmployeeBenefits | OtherEmployeeBenefitsGrp | OtherExpensePriorYear | OtherExpenses | OtherExpensesCurrentYear | OtherExpensesGrp | OtherExplainInSchO | OtherInd | OtherLiabilities | OtherLiabilitiesGrp | OtherNotesLoansReceivableNet | OtherOrganizationDsc | OtherRevenueCurrentYear | OtherRevenueMisc | OtherRevenueMiscGrp | OtherRevenuePriorYear | OtherRevenueTotalAmt | OtherSalariesAndWages | OtherSalariesAndWagesGrp | OtherWebsite | OtherWebsiteInd | OwnWebsite | OwnWebsiteInd | PYBenefitsPaidToMembersAmt | PYContributionsGrantsAmt | PYExcessBenefitTransInd | PYGrantsAndSimilarPaidAmt | PYInvestmentIncomeAmt | PYOtherExpensesAmt | PYOtherRevenueAmt | PYProgramServiceRevenueAmt | PYRevenuesLessExpensesAmt | PYSalariesCompEmpBnftPaidAmt | PYTotalExpensesAmt | PYTotalProfFndrsngExpnsAmt | PYTotalRevenueAmt | PaidInCapSrplsLandBldgEqpFund | PartialLiquidation | PartialLiquidationInd | PayPremiumsPrsnlBnftCntrctInd | PaymentsToAffiliates | PaymentsToAffiliatesGrp | PayrollTaxes | PayrollTaxesGrp | PdInCapSrplsLandBldgEqpFundGrp | PensionPlanContributions | PensionPlanContributionsGrp | PermanentlyRestrictedNetAssets | PermanentlyRstrNetAssetsGrp | PledgesAndGrantsReceivable | PledgesAndGrantsReceivableGrp | PoliciesReferenceChapters | PoliciesReferenceChaptersInd | PoliticalActivities | PoliticalCampaignActyInd | PremiumsPaid | PrepaidExpensesDeferredCharges | PrepaidExpensesDefrdChargesGrp | PrincipalOfcrBusinessAddress | PrincipalOfcrBusinessName | PrincipalOfficerNm | PriorExcessBenefitTransaction | PriorPeriodAdjustmentsAmt | ProfessionalFundraising | ProfessionalFundraisingInd | ProgSrvcAccomActy2Grp | ProgSrvcAccomActy3Grp | ProgSrvcAccomActyOtherGrp | ProgramServiceRevenue | ProgramServiceRevenueCY | ProgramServiceRevenueGrp | ProgramServiceRevenuePriorYear | ProhibitedTaxShelterTrans | ProhibitedTaxShelterTransInd | PymtTravelEntrtnmntPubOfclGrp | QuidProQuoContriDisclInd | QuidProQuoContributions | QuidProQuoContributionsInd | QuidProQuoDisclosure | RcvFndsToPayPrsnlBnftCntrctInd | RcvblFromDisqualifiedPrsnGrp | ReceivablesFromDisqualPersons | ReceivablesFromOfficersEtc | ReceivablesFromOfficersEtcGrp | ReconcilationDonatedServices | ReconcilationInvestExpenses | ReconcilationOtherChanges | ReconcilationPriorAdjustment | ReconcilationRevenueExpenses | ReconcilationRevenueExpnssAmt | ReconciliationUnrealizedInvest | RegularMonitoringEnforcement | RegularMonitoringEnfrcInd | RelatedEntity | RelatedEntityInd | RelatedOrgControlledEntity | RelatedOrganizationCtrlEntInd | RelatedOrganizations | RelatedOrganizationsAmt | RentalIncomeOrLoss | RentalIncomeOrLossGrp | ReportFin48Footnote | ReportInvestOthSecurities | ReportInvestmentsOtherSecInd | ReportLandBldgEquip | ReportLandBuildingEquipmentInd | ReportOtherAssets | ReportOtherAssetsInd | ReportOtherLiabilities | ReportOtherLiabilitiesInd | ReportProgRelInvest | ReportProgramRelatedInvstInd | ReservesMaintainedAmt | RetainedEarningsEndowmentEtc | Revenue | RevenueAmt | RevenuesLessExpensesCY | RevenuesLessExpensesPriorYear | Royalties | RoyaltiesGrp | RoyaltiesRevenue | RoyaltiesRevenueGrp | RtnEarnEndowmentIncmOthFndsGrp | SalariesEtcCurrentYear | SalariesEtcPriorYear | SavingsAndTempCashInvestments | SavingsAndTempCashInvstGrp | ScheduleBRequired | ScheduleBRequiredInd | ScheduleJRequired | ScheduleJRequiredInd | ScheduleORequired | ScheduleORequiredInd | School | SchoolOperatingInd | SignificantChange | SignificantChangeInd | SignificantNewProgramServices | SignificantNewProgramSrvcInd | SpecialConditionDescription | StateLegalDomicile | StateRequiredReservesAmt | StatesWhereCopyOfReturnIsFiled | StatesWhereCopyOfReturnIsFldCd | SubjectToProxyTax | SubjectToProxyTaxInd | SubmittedOn | TaxExemptBondLiabilities | TaxExemptBondLiabilitiesGrp | TaxExemptBonds | TaxExemptBondsInd | TaxPeriod | TaxableDistributions | TaxableDistributionsInd | TaxablePartyNotification | TaxablePartyNotificationInd | TempOrPermanentEndowmentsInd | TemporarilyRestrictedNetAssets | TemporarilyRstrNetAssetsGrp | TermOrPermanentEndowments | TerminateOperationsInd | Terminated | TerminationOrContraction | TheBooksAreInCareOf | TotLiabNetAssetsFundBalanceGrp | TotReportableCompRltdOrgAmt | TotalAssets | TotalAssetsBOY | TotalAssetsBOYAmt | TotalAssetsEOY | TotalAssetsEOYAmt | TotalAssetsGrp | TotalCompGT150K | TotalCompGreaterThan150KInd | TotalContributions | TotalContributionsAmt | TotalEmployeeCnt | TotalExpensesCurrentYear | TotalExpensesPriorYear | TotalFunctionalExpenses | TotalFunctionalExpensesGrp | TotalFundrsngExpCurrentYear | TotalGrossUBI | TotalGrossUBIAmt | TotalJointCosts | TotalJointCostsGrp | TotalLiabNetAssetsFundBalances | TotalLiabilities | TotalLiabilitiesBOY | TotalLiabilitiesBOYAmt | TotalLiabilitiesEOY | TotalLiabilitiesEOYAmt | TotalLiabilitiesGrp | TotalNbrEmployees | TotalNbrVolunteers | TotalNetAssetsFundBalanceGrp | TotalNetAssetsFundBalances | TotalOfOtherProgramServiceExp | TotalOfOtherProgramServiceGrnt | TotalOfOtherProgramServiceRev | TotalOthProgramServiceRevGrp | TotalOthProgramServiceRevenue | TotalOtherCompensation | TotalOtherCompensationAmt | TotalOtherProgSrvcExpenseAmt | TotalOtherProgSrvcGrantAmt | TotalOtherProgSrvcRevenueAmt | TotalOtherRevenue | TotalProfFundrsngExpCY | TotalProfFundrsngExpPriorYear | TotalProgramServiceExpense | TotalProgramServiceExpensesAmt | TotalProgramServiceRevenue | TotalProgramServiceRevenueAmt | TotalReportableCompFrmRltdOrgs | TotalReportableCompFromOrg | TotalReportableCompFromOrgAmt | TotalRevenue | TotalRevenueCurrentYear | TotalRevenueGrp | TotalRevenuePriorYear | TotalVolunteersCnt | TransactionRelatedEntity | TransactionWithControlEntInd | TransfersToExemptNonChrtblOrg | Travel | TravelEntrtnmntPublicOfficials | TravelGrp | TrnsfrExmptNonChrtblRltdOrgInd | TypeOfOrgOtherDescription | TypeOfOrganizationAssocInd | TypeOfOrganizationAssociation | TypeOfOrganizationCorpInd | TypeOfOrganizationCorporation | TypeOfOrganizationOther | TypeOfOrganizationOtherInd | TypeOfOrganizationTrust | TypeOfOrganizationTrustInd | URL | USAddress | UnrelatedBusIncmOverLimitInd | UnrelatedBusinessIncome | UnrestrictedNetAssets | UnrestrictedNetAssetsGrp | UnsecuredNotesLoansPayable | UnsecuredNotesLoansPayableGrp | UponRequest | UponRequestInd | VotingMembersGoverningBodyCnt | VotingMembersIndependentCnt | WebSite | WebsiteAddressTxt | WhistleblowerPolicy | WhistleblowerPolicyInd | WrittenPolicyOrProcedure | WrittenPolicyOrProcedureInd | YearFormation | _id | FeesForServicesAccountingGrp_v2 | FYE | FeesForServicesAccounting_TotalAmt | FeesForServicesAccounting_binary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18819 | RetDoc1038000001 | RetDoc1044400001 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | {u'BOYAmt': u'506341', u'EOYAmt': u'557069'} | NaN | NaN | {u'BOYAmt': u'315681', u'EOYAmt': u'161137'} | NaN | NaN | 0 | NaN | NaN | NaN | NaN | TO DEVELOP SOLUTIONS TO COMPLEX HUMAN & ENVIRONMENTAL HEALTH PROBLEMS THROUGH RESEARCH & EDUCATION. | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2334773 | NaN | {u'TotalAmt': u'660207', u'ProgramServicesAmt': u'515736', u'ManagementAndGeneralAmt': u'95934', u'FundraisingAmt': u'48537'} | NaN | NaN | NaN | 1 | NaN | 1 | NaN | 1 | NaN | NaN | NaN | NaN | NaN | {u'PersonNm': u'CLAUDINE D LURVEY', u'PhoneNum': u'2072889880', u'USAddress': {u'CityNm': u'SALISBURY COVE', u'StateAbbreviationCd': u'ME', u'ZIPCd': u'04672', u'AddressLine1Txt': u'OLD BAR HARBOR ROAD'}} | NaN | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1042800001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1042800001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1042800001'} | 0 | 10272455 | 0 | 982272 | 6661282 | 1404 | 1711837 | 2136547 | 4155139 | 10831421 | 554608 | 15000 | 12967968 | NaN | NaN | NaN | {u'BOYAmt': u'1121795', u'EOYAmt': u'2540456'} | 0 | NaN | NaN | 1 | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 1 | NaN | 1 | NaN | NaN | NaN | NaN | 1 | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | NaN | {u'ContractorName': {u'BusinessName': {u'BusinessNameLine1Txt': u'GOODIE'S ELECTRICAL SERVICE'}}, u'ServicesDesc': u'ELECTRICIAN', u'ContractorAddress': {u'USAddress': {u'CityNm': u'BAR HARBOR', u'StateAbbreviationCd': u'ME', u'ZIPCd': u'04609', ... | 800 | NaN | NaN | NaN | 0 | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | 93493320080035 | 0 | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1042900001'} | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1042900001'} | NaN | NaN | {u'BOYAmt': u'529444', u'EOYAmt': u'835605'} | NaN | 1 | NaN | {u'TotalAmt': u'1121059', u'ProgramServicesAmt': u'896847', u'ManagementAndGeneralAmt': u'168159', u'FundraisingAmt': u'56053'} | THE MDI BIOLOGICAL LABORATORY IS RECOGNIZED BY THE NATIONAL INSTITUTES OF HEALTH AS A CENTER OF BIOMEDICAL RESEARCH EXCELLENCE (COBRE) FOR OUR INNOVATIVE RESEARCH PROGRAM IN REGENERATIVE AND AGING BIOLOGY AND MEDICINE.OUR APPROACH SPEEDS THE DISC... | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1039100001'} | NaN | NaN | 0 | NaN | NaN | NaN | NaN | 1 | NaN | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | 010202467 | NaN | 1 | 96 | NaN | 1 | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1042800001'} | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 8605100 | NaN | NaN | {u'SeparateBasisFinclStmtInd': u'X'} | 1 | NaN | 0 | NaN | 1 | NaN | 1 | NaN | NaN | NaN | {u'TotalAmt': u'23700', u'ManagementAndGeneralAmt': u'23700'} | NaN | NaN | {u'TotalAmt': u'3792', u'ManagementAndGeneralAmt': u'3792'} | NaN | {u'TotalAmt': u'89167', u'ManagementAndGeneralAmt': u'3379', u'FundraisingAmt': u'85788'} | NaN | NaN | NaN | {u'TotalAmt': u'201518', u'ProgramServicesAmt': u'127950', u'ManagementAndGeneralAmt': u'64909', u'FundraisingAmt': u'8659'} | {u'TotalAmt': u'15000', u'FundraisingAmt': u'15000'} | {u'TotalAmt': u'18902', u'ManagementAndGeneralAmt': u'18902'} | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | [{u'PersonNm': u'DR EDWARD J BENZ', u'IndividualTrusteeOrDirectorInd': u'X', u'AverageHoursPerWeekRt': u'2.00', u'ReportableCompFromRltdOrgAmt': u'0', u'OtherCompensationAmt': u'0', u'TitleTxt': u'VICE CHAIRMAN OF THE BOARD', u'OfficerInd': u'X',... | NaN | 1 | NaN | 990 | 1898 | 0 | NaN | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1041300001'} | 800 | NaN | 37598 | NaN | 38964 | NaN | NaN | {u'OtherAmt': u'322752'} | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1041300001'} | NaN | NaN | NaN | 22 | NaN | 7932282 | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1042800001'} | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 0 | NaN | {u'OtherAmt': u'340000'} | NaN | NaN | NaN | 13022814 | NaN | NaN | NaN | 38 | NaN | NaN | NaN | 0 | NaN | 158 | 0 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | NaN | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 21 | 3 | NaN | 0 | NaN | X | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | X | NaN | NaN | NaN | NaN | X | X | NaN | NaN | {u'TotalAmt': u'49382', u'ProgramServicesAmt': u'8673', u'ManagementAndGeneralAmt': u'40709'} | NaN | NaN | NaN | {u'TotalAmt': u'92396', u'ProgramServicesAmt': u'10397', u'ManagementAndGeneralAmt': u'81746', u'FundraisingAmt': u'253'} | NaN | {u'BOYAmt': u'147691', u'EOYAmt': u'141271'} | NaN | {u'TotalAmt': u'255985', u'ProgramServicesAmt': u'191989', u'ManagementAndGeneralAmt': u'63996'} | NaN | {u'BOYAmt': u'8329'} | NaN | 0 | NaN | NaN | 0 | NaN | NaN | {u'TotalRevenueColumnAmt': u'659520', u'ExclusionAmt': u'659520'} | NaN | NaN | NaN | NaN | NaN | {u'BOYAmt': u'4031973', u'EOYAmt': u'4747115'} | NaN | True | True | NaN | NaN | 11322245 | {u'BOYAmt': u'14025403', u'EOYAmt': u'14020382'} | 25342627 | NaN | NaN | NaN | 2016-04-29T13:40:20 | ME | NaN | {u'OtherAmt': u'17248'} | NaN | NaN | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1042800001'} | NaN | NaN | NaN | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1039700001'} | NaN | 0 | NaN | 0 | NaN | 1 | NaN | 4600 | NaN | X | NaN | NaN | NaN | NaN | NaN | 1 | NaN | 1 | NaN | NaN | THE MDI BIOLOGICAL LABORATORY IS A RAPIDLY GROWING, INDEPENDENT NON-PROFIT BIOMEDICAL RESEARCH INSTITUTION. ITS MISSION IS TO IMPROVE HUMAN HEALTH AND WELL-BEING THROUGH RESEARCH, EDUCATION,AND DEVELOPMENT VENTURES THAT TRANSFORM DISCOVERIES INTO... | NaN | NaN | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 21657900 | NaN | 23690097 | NaN | {u'TotalRevenueColumnAmt': u'322752', u'ExclusionAmt': u'322752'} | {u'TotalRevenueColumnAmt': u'1366', u'ExclusionAmt': u'1366'} | NaN | NaN | NaN | NaN | {u'TotalRevenueColumnAmt': u'38', u'ExclusionAmt': u'38'} | NaN | NaN | 0 | NaN | -104353 | NaN | X | NaN | NaN | NaN | 57474 | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 201533209349308003 | NaN | {u'TotalAmt': u'587738', u'ProgramServicesAmt': u'494365', u'ManagementAndGeneralAmt': u'91234', u'FundraisingAmt': u'2139'} | NaN | NaN | NaN | NaN | 0 | NaN | 0 | 0 | NaN | NaN | NaN | NaN | X | NaN | X | MOUNT DESERT ISLAND BIOLOGICAL LABORATORY | NaN | NaN | NaN | 3 | NaN | {u'TotalAmt': u'370740', u'ProgramServicesAmt': u'284086', u'ManagementAndGeneralAmt': u'70910', u'FundraisingAmt': u'15744'} | NaN | NaN | NaN | [{u'TotalAmt': u'2124748', u'ProgramServicesAmt': u'2124748', u'Desc': u'SUBAWARDS'}, {u'TotalAmt': u'682378', u'ProgramServicesAmt': u'627645', u'ManagementAndGeneralAmt': u'47547', u'FundraisingAmt': u'7186', u'Desc': u'SUPPLIES'}, {u'TotalAmt'... | NaN | NaN | NaN | {u'BOYAmt': u'4630000', u'EOYAmt': u'4525000'} | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | {u'TotalAmt': u'3353971', u'ProgramServicesAmt': u'2478596', u'ManagementAndGeneralAmt': u'683266', u'FundraisingAmt': u'192109'} | NaN | X | NaN | X | 0 | 7802139 | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1042800001'} | 0 | 785633 | 6736413 | -13375 | 1591204 | -449665 | 3878853 | 10615266 | 0 | 10165601 | NaN | NaN | 0 | NaN | NaN | NaN | NaN | {u'TotalAmt': u'257923', u'ProgramServicesAmt': u'189245', u'ManagementAndGeneralAmt': u'52592', u'FundraisingAmt': u'16086'} | NaN | NaN | {u'TotalAmt': u'172505', u'ProgramServicesAmt': u'125037', u'ManagementAndGeneralAmt': u'37384', u'FundraisingAmt': u'10084'} | NaN | {u'BOYAmt': u'4193257', u'EOYAmt': u'4433997'} | NaN | {u'BOYAmt': u'797504', u'EOYAmt': u'986621'} | NaN | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1039700001'} | NaN | NaN | {u'BOYAmt': u'4791', u'EOYAmt': u'5301'} | NaN | NaN | DR KEVIN STRANGE PHD | NaN | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1041300001'} | NaN | NaN | NaN | NaN | NaN | [{u'TotalRevenueColumnAmt': u'469300', u'BusinessCd': u'611710', u'RelatedOrExemptFuncIncomeAmt': u'469300', u'Desc': u'CONFER & COURSE FEES'}, {u'TotalRevenueColumnAmt': u'353507', u'BusinessCd': u'541700', u'RelatedOrExemptFuncIncomeAmt': u'353... | NaN | NaN | 0 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2136547 | NaN | NaN | 1 | NaN | 0 | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | NaN | NaN | 1733437 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | {u'BOYAmt': u'6870518', u'EOYAmt': u'7005488'} | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1234500001'} | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1042400001'} | NaN | 1 | NaN | 0 | NaN | 0 | NaN | 0 | NaN | NaN | NaN | NaN | ME | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1039700001'} | 2016-02-16 | NaN | NaN | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | 201412 | NaN | NaN | NaN | 0 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | {u'BOYAmt': u'5880227', u'EOYAmt': u'7351259'} | NaN | 0 | NaN | NaN | NaN | {u'BOYAmt': u'27323685', u'EOYAmt': u'29607771'} | 0 | NaN | NaN | 27323685 | NaN | 29607771 | {u'BOYAmt': u'27323685', u'EOYAmt': u'29607771'} | NaN | 1 | NaN | 10272455 | 96 | NaN | NaN | NaN | {u'TotalAmt': u'10831421', u'ProgramServicesAmt': u'8605100', u'ManagementAndGeneralAmt': u'1671713', u'FundraisingAmt': u'554608'} | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | 5665785 | NaN | 5917674 | {u'BOYAmt': u'5665785', u'EOYAmt': u'5917674'} | NaN | NaN | {u'BOYAmt': u'21657900', u'EOYAmt': u'23690097'} | NaN | NaN | NaN | NaN | {u'TotalRevenueColumnAmt': u'38356', u'RelatedOrExemptFuncIncomeAmt': u'38356'} | NaN | NaN | 67100 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8605100 | NaN | 1711837 | NaN | NaN | 496334 | NaN | NaN | {u'UnrelatedBusinessRevenueAmt': u'0', u'TotalRevenueColumnAmt': u'12967968', u'RelatedOrExemptFuncIncomeAmt': u'1711837', u'ExclusionAmt': u'983676'} | NaN | 125 | NaN | NaN | NaN | NaN | NaN | {u'TotalAmt': u'128657', u'ProgramServicesAmt': u'84416', u'ManagementAndGeneralAmt': u'33781', u'FundraisingAmt': u'10460'} | 0 | NaN | NaN | NaN | X | NaN | NaN | NaN | NaN | NaN | https://s3.amazonaws.com/irs-form-990/201533209349308003_public.xml | {u'CityNm': u'SALISBURY COVE', u'StateAbbreviationCd': u'ME', u'ZIPCd': u'046720035', u'AddressLine1Txt': u'PO BOX 35'} | 0 | NaN | NaN | {u'BOYAmt': u'11584416', u'EOYAmt': u'11904841'} | NaN | NaN | NaN | X | 22 | 21 | NaN | WWW.MDIBL.ORG | NaN | 1 | NaN | NaN | NaN | 58c33e003ffc5a1664e96b73 | {u'TotalAmt': u'23700', u'ManagementAndGeneralAmt': u'23700'} | FY2014 | 23700.0 | 1 |
vars = pd.read_pickle('variable descriptions 2010-2015 990 e-file data (first year only).pkl')
print '# of columns:', len(vars.columns)
print '# of observations:', len(vars)
vars.head(1)
# of columns: 4 # of observations: 652
variable | description | line_number | first year | |
---|---|---|---|---|
0 | AccountantCompileOrReview | Accountant provide compilation or review? | Part XII Line 2a | 2010 |
cols_2013 = ['GoverningBodyVotingMembersCnt', 'IndependentVotingMemberCnt',
'VotingMembersGoverningBodyCnt', 'VotingMembersIndependentCnt']
cols_2010 = ['NbrIndependentVotingMembers', 'NbrVotingGoverningBodyMembers',
'NbrVotingMembersGoverningBody', 'NumberIndependentVotingMembers']
df_all[['FYE', 'EIN'] + cols_2010+cols_2013][:20]
FYE | EIN | NbrIndependentVotingMembers | NbrVotingGoverningBodyMembers | NbrVotingMembersGoverningBody | NumberIndependentVotingMembers | GoverningBodyVotingMembersCnt | IndependentVotingMemberCnt | VotingMembersGoverningBodyCnt | VotingMembersIndependentCnt | |
---|---|---|---|---|---|---|---|---|---|---|
18819 | FY2014 | 010202467 | NaN | NaN | NaN | NaN | 22 | 21 | 22 | 21 |
7965 | FY2013 | 010202467 | NaN | NaN | NaN | NaN | 21 | 21 | 21 | 21 |
8989 | FY2012 | 010202467 | 20 | 20 | 20 | 20 | NaN | NaN | NaN | NaN |
22904 | FY2011 | 010202467 | 23 | 23 | 23 | 23 | NaN | NaN | NaN | NaN |
19649 | FY2010 | 010202467 | 24 | 24 | 24 | 24 | NaN | NaN | NaN | NaN |
28581 | FY2015 | 010211478 | NaN | NaN | NaN | NaN | 18 | 18 | 18 | 18 |
20683 | FY2014 | 010211478 | NaN | NaN | NaN | NaN | 18 | 18 | 18 | 18 |
4797 | FY2013 | 010211478 | 18 | 18 | 18 | 18 | NaN | NaN | NaN | NaN |
15877 | FY2012 | 010211478 | 18 | 18 | 18 | 18 | NaN | NaN | NaN | NaN |
18981 | FY2011 | 010211478 | 19 | 19 | 19 | 19 | NaN | NaN | NaN | NaN |
589 | FY2014 | 010211513 | NaN | NaN | NaN | NaN | 23 | 21 | 23 | 21 |
21460 | FY2013 | 010211513 | NaN | NaN | NaN | NaN | 22 | 20 | 22 | 20 |
4502 | FY2012 | 010211513 | 20 | 22 | 22 | 20 | NaN | NaN | NaN | NaN |
13619 | FY2011 | 010211513 | 21 | 21 | 21 | 21 | NaN | NaN | NaN | NaN |
17018 | FY2010 | 010211513 | 24 | 25 | 25 | 24 | NaN | NaN | NaN | NaN |
28147 | FY2014 | 010211530 | NaN | NaN | NaN | NaN | 28 | 28 | 28 | 28 |
6614 | FY2013 | 010211530 | 29 | 29 | 29 | 29 | NaN | NaN | NaN | NaN |
27887 | FY2012 | 010211530 | 27 | 27 | 27 | 27 | NaN | NaN | NaN | NaN |
13438 | FY2011 | 010211530 | 29 | 29 | 29 | 29 | NaN | NaN | NaN | NaN |
17003 | FY2010 | 010211530 | 29 | 29 | 29 | 29 | NaN | NaN | NaN | NaN |
print len(df_all[(df_all['IndependentVotingMemberCnt'].notnull())])
print len(df_all[(df_all['VotingMembersIndependentCnt'].notnull())])
print len(df_all[(df_all['IndependentVotingMemberCnt'].isnull()) & (df_all['VotingMembersIndependentCnt'].notnull())])
print len(df_all[(df_all['VotingMembersIndependentCnt'].isnull()) & (df_all['IndependentVotingMemberCnt'].notnull())]), '\n'
print len(df_all[(df_all['GoverningBodyVotingMembersCnt'].isnull()) & (df_all['VotingMembersGoverningBodyCnt'].notnull())])
print len(df_all[(df_all['VotingMembersGoverningBodyCnt'].isnull()) & (df_all['GoverningBodyVotingMembersCnt'].notnull())])
11143 11143 0 0 0 0
Based on the above we can drop the first two for each year -- we'll go with the Section I, Line 3 and 4 versions, which seem to be the same
cols = ['EIN', 'FYE']
#cols_2013 = ['VotingMembersGoverningBodyCnt', 'VotingMembersIndependentCnt']
#cols_2010 = ['NbrIndependentVotingMembers', 'NbrVotingMembersGoverningBody']
cols = cols + ['VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody',
'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers']
cols = cols + ['DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties']
cols = cols + ['Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody']
cols = cols + ['AuditCommitteeInd', 'AuditCommittee']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee']
7351259+4433997+11904841
23690097
What we see here is that TemporarilyRstrNetAssetsGrp + PermanentlyRstrNetAssetsGrp + UnrestrictedNetAssetsGrp = TotalNetAssetsFundBalanceGrp = NetAssetsOrFundBalancesEOYAmt
asset_cols = ['TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']
df_all[['EIN', 'FYE']+asset_cols][:10]
EIN | FYE | TemporarilyRstrNetAssetsGrp | TemporarilyRestrictedNetAssets | PermanentlyRstrNetAssetsGrp | PermanentlyRestrictedNetAssets | UnrestrictedNetAssetsGrp | UnrestrictedNetAssets | TotalNetAssetsFundBalanceGrp | TotalNetAssetsFundBalances | NetAssetsOrFundBalancesBOYAmt | NetAssetsOrFundBalancesBOY | NetAssetsOrFundBalancesEOYAmt | NetAssetsOrFundBalancesEOY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18819 | 010202467 | FY2014 | {u'BOYAmt': u'5880227', u'EOYAmt': u'7351259'} | NaN | {u'BOYAmt': u'4193257', u'EOYAmt': u'4433997'} | NaN | {u'BOYAmt': u'11584416', u'EOYAmt': u'11904841'} | NaN | {u'BOYAmt': u'21657900', u'EOYAmt': u'23690097'} | NaN | 21657900 | NaN | 23690097 | NaN |
7965 | 010202467 | FY2013 | {u'BOYAmt': u'5773597', u'EOYAmt': u'5880227'} | NaN | {u'BOYAmt': u'4153627', u'EOYAmt': u'4193257'} | NaN | {u'BOYAmt': u'12203695', u'EOYAmt': u'11584416'} | NaN | {u'BOYAmt': u'22130919', u'EOYAmt': u'21657900'} | NaN | 22130919 | NaN | 21657900 | NaN |
8989 | 010202467 | FY2012 | NaN | {u'BOY': u'6554891', u'EOY': u'5773597'} | NaN | {u'BOY': u'3096087', u'EOY': u'4153627'} | NaN | {u'BOY': u'10835720', u'EOY': u'12203695'} | NaN | {u'BOY': u'20486698', u'EOY': u'22130919'} | NaN | 20486698 | NaN | 22130919 |
22904 | 010202467 | FY2011 | NaN | {u'BOY': u'7110362', u'EOY': u'6554891'} | NaN | {u'BOY': u'2873816', u'EOY': u'3096087'} | NaN | {u'BOY': u'7951722', u'EOY': u'10835720'} | NaN | {u'BOY': u'17935900', u'EOY': u'20486698'} | NaN | 17935900 | NaN | 20486698 |
19649 | 010202467 | FY2010 | NaN | {u'BOY': u'7291747', u'EOY': u'7110362'} | NaN | {u'BOY': u'2824416', u'EOY': u'2873816'} | NaN | {u'BOY': u'7878008', u'EOY': u'7951722'} | NaN | {u'BOY': u'17994171', u'EOY': u'17935900'} | NaN | 17994171 | NaN | 17935900 |
28581 | 010211478 | FY2015 | {u'BOYAmt': u'889233', u'EOYAmt': u'908556'} | NaN | {u'BOYAmt': u'2717476', u'EOYAmt': u'2717000'} | NaN | {u'BOYAmt': u'1254972', u'EOYAmt': u'1179056'} | NaN | {u'BOYAmt': u'4861681', u'EOYAmt': u'4804612'} | NaN | 4861681 | NaN | 4804612 | NaN |
20683 | 010211478 | FY2014 | {u'BOYAmt': u'614778', u'EOYAmt': u'889233'} | NaN | {u'BOYAmt': u'2669412', u'EOYAmt': u'2717476'} | NaN | {u'BOYAmt': u'1291239', u'EOYAmt': u'1254972'} | NaN | {u'BOYAmt': u'4575429', u'EOYAmt': u'4861681'} | NaN | 4575429 | NaN | 4861681 | NaN |
4797 | 010211478 | FY2013 | NaN | {u'BOY': u'563731', u'EOY': u'614778'} | NaN | {u'BOY': u'2652247', u'EOY': u'2669412'} | NaN | {u'BOY': u'1362377', u'EOY': u'1291239'} | NaN | {u'BOY': u'4578355', u'EOY': u'4575429'} | NaN | 4578355 | NaN | 4575429 |
15877 | 010211478 | FY2012 | NaN | {u'BOY': u'511917', u'EOY': u'563731'} | NaN | {u'BOY': u'2662228', u'EOY': u'2652247'} | NaN | {u'BOY': u'1383839', u'EOY': u'1362377'} | NaN | {u'BOY': u'4557984', u'EOY': u'4578355'} | NaN | 4557984 | NaN | 4578355 |
18981 | 010211478 | FY2011 | NaN | {u'BOY': u'586298', u'EOY': u'511917'} | NaN | {u'BOY': u'406957', u'EOY': u'2662228'} | NaN | {u'BOY': u'928373', u'EOY': u'1383839'} | NaN | {u'BOY': u'1921628', u'EOY': u'4557984'} | NaN | 1921628 | NaN | 4557984 |
#cols = cols + ['TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
#'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
#'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
#'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']
#print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']
print df_all.columns.tolist()[-20:]
[u'UponRequest', u'UponRequestInd', u'VotingMembersGoverningBodyCnt', u'VotingMembersIndependentCnt', u'WebSite', u'WebsiteAddressTxt', u'WhistleblowerPolicy', u'WhistleblowerPolicyInd', u'WrittenPolicyOrProcedure', u'WrittenPolicyOrProcedureInd', u'YearFormation', u'_id', 'FeesForServicesAccountingGrp_v2', 'FYE', 'FeesForServicesAccounting_TotalAmt', 'FeesForServicesAccounting_binary', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions']
cols = cols + ['perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions']
cols = cols + ['TaxExemptBondsInd', 'TaxExemptBonds']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds']
cols = cols + ['NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome']
#df_all['AuditedFinancialStmtAttInd'].value_counts()
{u'#text': u'true', u'@referenceDocumentId': u'BinaryAttach BinaryAttachN1 BinaryAttachN2'} 2 {u'#text': u'true', u'@referenceDocumentId': u'BinaryAttach BinaryAttachN1'} 2 {u'#text': u'true', u'@referenceDocumentId': u'990:103'} 2 {u'#text': u'true', u'@referenceDocumentId': u'RetDoc3'} 2 1 1 {u'#text': u'1', u'@referenceDocumentId': u'RetDoc2317200001'} 1 Name: AuditedFinancialStmtAttInd, dtype: int64
audit_cols = [#'AuditedFinancialStmtAttInd',
#'ConsolidatedAuditFinclStmtInd', 'ConsolidatedAuditFinancialStmt',
'FSAuditedInd', 'FSAudited', #'FSAuditedBasisGrp', 'FSAuditedBasis',
'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed',
'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired',
'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', ]
df_all[['EIN', 'FYE']+ audit_cols][:8]
EIN | FYE | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | |
---|---|---|---|---|---|---|---|---|---|---|
18819 | 010202467 | FY2014 | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN |
7965 | 010202467 | FY2013 | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN |
8989 | 010202467 | FY2012 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} |
22904 | 010202467 | FY2011 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} |
19649 | 010202467 | FY2010 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1039900001'} |
28581 | 010211478 | FY2015 | true | NaN | NaN | NaN | false | NaN | {u'#text': u'true', u'@referenceDocumentId': u'RetDoc3'} | NaN |
20683 | 010211478 | FY2014 | true | NaN | NaN | NaN | false | NaN | {u'#text': u'true', u'@referenceDocumentId': u'RetDoc4'} | NaN |
4797 | 010211478 | FY2013 | NaN | true | NaN | NaN | NaN | false | NaN | {u'#text': u'true', u'@referenceDocumentId': u'RetDoc3'} |
cols = cols + ['FSAuditedInd', 'FSAudited',
'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed',
'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired',
'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', ]
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt']
I NEED TO DO IT BEFORE COLLAPSING -- OTHERWISE IT WILL CAUSE PROBLEMS FOR AN EIN THAT HAS NO PERMANENTLY RESTRICTED ASSETS IN ONE YEAR, SAY, BUT IT DOES THE NEXT -- THE GROUPBY FUNCTION WILL BRING THOSE INTO THE SAME ROW.
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in df_all[:].iterrows():
perm_rest = 0
if pd.notnull(row['PermanentlyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['PermanentlyRestrictedNetAssets']):
print 'going to second variable'
if 'EOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['EOY']
elif 'BOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['BOY']
df_all.ix[index, 'perm_rest_assets'] = perm_rest
clear_output()
print ('Index: ', index, 'Perm. Restr Assets:', perm_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print df_all['perm_rest_assets'].value_counts().sum()
('Index: ', 26647, 'Perm. Restr Assets:', 0) # of minutes: 1.63736418486 29440
print len(df_all[df_all['perm_rest_assets'].notnull()])
print len(df_all[df_all['perm_rest_assets'].isnull()])
print df_all['perm_rest_assets'].describe(), '\n'
df_all['perm_rest_assets'] = df_all['perm_rest_assets'].astype('int')
print df_all['perm_rest_assets'].describe()
29440 0 count 29440 unique 10252 top 0 freq 13659 Name: perm_rest_assets, dtype: int64 count 2.944000e+04 mean 5.853230e+06 std 3.626274e+07 min 0.000000e+00 25% 0.000000e+00 50% 2.500000e+03 75% 1.215208e+06 max 1.097021e+09 Name: perm_rest_assets, dtype: float64
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in df_all[:].iterrows():
temp_rest = 0
if pd.notnull(row['TemporarilyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['TemporarilyRestrictedNetAssets']):
print 'going to second'
if 'EOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['EOY']
elif 'BOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['BOY']
df_all.ix[index, 'temp_rest_assets'] = temp_rest
clear_output()
print ('Index: ', index, 'Temp. Restr Assets:', temp_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print df_all['temp_rest_assets'].value_counts().sum()
('Index: ', 26647, 'Temp. Restr Assets:', u'96956') # of minutes: 1.76544401646 29440
print len(df_all[df_all['temp_rest_assets'].notnull()])
print len(df_all[df_all['temp_rest_assets'].isnull()])
print df_all['temp_rest_assets'].describe(), '\n'
df_all['temp_rest_assets'] = df_all['temp_rest_assets'].astype('int')
print df_all['temp_rest_assets'].describe()
29440 0 count 29440 unique 24978 top 0 freq 2849 Name: temp_rest_assets, dtype: int64 count 2.944000e+04 mean 7.365323e+06 std 3.905838e+07 min -2.158587e+06 25% 1.274518e+05 50% 6.781395e+05 75% 2.880510e+06 max 1.473751e+09 Name: temp_rest_assets, dtype: float64
df_all['net_assets'] = df_all['NetAssetsOrFundBalancesEOYAmt']
df_all['net_assets'] = np.where(df_all['net_assets'].isnull(),
df_all['NetAssetsOrFundBalancesEOY'], df_all['net_assets'])
print df_all['net_assets'].describe(), '\n'
df_all['net_assets'] = df_all['net_assets'].astype('int')
print df_all['net_assets'].describe()
count 29440 unique 28932 top 0 freq 41 Name: net_assets, dtype: object count 2.944000e+04 mean 3.175411e+07 std 1.712160e+08 min -1.399548e+08 25% 1.734802e+06 50% 5.309920e+06 75% 1.683359e+07 max 8.116399e+09 Name: net_assets, dtype: float64
df_all['donor_restrictions'] = (df_all['temp_rest_assets']+df_all['perm_rest_assets'])/df_all['net_assets']
df_all['donor_restrictions'].describe()
count 2.940000e+04 mean inf std NaN min -1.564426e+02 25% 6.059707e-02 50% 2.498250e-01 75% 5.590346e-01 max inf Name: donor_restrictions, dtype: float64
df_all.dtypes[-5:]
FeesForServicesAccounting_binary int64 perm_rest_assets int64 temp_rest_assets int64 net_assets int64 donor_restrictions float64 dtype: object
THERE ARE 40 MISSING CASES -- ASSIGN THEM A VALUE OF ZERO
print len(df_all[df_all['donor_restrictions'].isnull()])
40
df_all['donor_restrictions'] = np.where(df_all['donor_restrictions'].isnull(), 0, df_all['donor_restrictions'])
df_all['donor_restrictions'].describe()
count 2.944000e+04 mean inf std NaN min -1.564426e+02 25% 5.999956e-02 50% 2.490086e-01 75% 5.585148e-01 max inf Name: donor_restrictions, dtype: float64
print len(df_all[df_all['donor_restrictions'].isnull()])
0
CHECK A FEW >1 ROWS
df_all[['FYE', 'donor_restrictions', 'temp_rest_assets', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'perm_rest_assets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'net_assets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY',
#'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
#'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']][-6:]
FYE | donor_restrictions | temp_rest_assets | TemporarilyRstrNetAssetsGrp | TemporarilyRestrictedNetAssets | perm_rest_assets | PermanentlyRstrNetAssetsGrp | PermanentlyRestrictedNetAssets | net_assets | NetAssetsOrFundBalancesEOYAmt | NetAssetsOrFundBalancesEOY | TotalNetAssetsFundBalanceGrp | TotalNetAssetsFundBalances | NetAssetsOrFundBalancesEOYAmt | NetAssetsOrFundBalancesEOY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13277 | FY2011 | 0.011886 | 45000 | NaN | {u'BOY': u'73257', u'EOY': u'45000'} | 0 | NaN | NaN | 3785879 | NaN | 3785879 | NaN | {u'BOY': u'3228716', u'EOY': u'3785879'} | NaN | 3785879 |
616 | FY2014 | 0.143153 | 52310 | {u'BOYAmt': u'91351', u'EOYAmt': u'52310'} | NaN | 284363 | {u'BOYAmt': u'207970', u'EOYAmt': u'284363'} | NaN | 2351842 | 2351842 | NaN | {u'BOYAmt': u'2321970', u'EOYAmt': u'2351842'} | NaN | 2351842 | NaN |
8573 | FY2013 | 0.128908 | 91351 | {u'BOYAmt': u'467396', u'EOYAmt': u'91351'} | NaN | 207970 | {u'BOYAmt': u'100000', u'EOYAmt': u'207970'} | NaN | 2321970 | 2321970 | NaN | {u'BOYAmt': u'2441674', u'EOYAmt': u'2321970'} | NaN | 2321970 | NaN |
23577 | FY2012 | 0.232380 | 467396 | NaN | {u'BOY': u'171880', u'EOY': u'467396'} | 100000 | NaN | {u'BOY': u'0', u'EOY': u'100000'} | 2441674 | NaN | 2441674 | NaN | {u'BOY': u'1953372', u'EOY': u'2441674'} | NaN | 2441674 |
16294 | FY2011 | 0.087991 | 171880 | NaN | {u'BOY': u'96956', u'EOY': u'171880'} | 0 | NaN | NaN | 1953372 | NaN | 1953372 | NaN | {u'BOY': u'1720152', u'EOY': u'1953372'} | NaN | 1953372 |
26647 | FY2010 | 0.056365 | 96956 | NaN | {u'BOY': u'79000', u'EOY': u'96956'} | 0 | NaN | NaN | 1720152 | NaN | 1720152 | NaN | {u'BOY': u'1413351', u'EOY': u'1720152'} | NaN | 1720152 |
df_all[['FYE', 'donor_restrictions', 'temp_rest_assets', 'perm_rest_assets', 'net_assets']][-6:]
FYE | donor_restrictions | temp_rest_assets | perm_rest_assets | net_assets | |
---|---|---|---|---|---|
13277 | FY2011 | 0.011886 | 45000 | 0 | 3785879 |
616 | FY2014 | 0.143153 | 52310 | 284363 | 2351842 |
8573 | FY2013 | 0.128908 | 91351 | 207970 | 2321970 |
23577 | FY2012 | 0.232380 | 467396 | 100000 | 2441674 |
16294 | FY2011 | 0.087991 | 171880 | 0 | 1953372 |
26647 | FY2010 | 0.056365 | 96956 | 0 | 1720152 |
print len(df_all[df_all['donor_restrictions']>.99])
print len(df_all[df_all['donor_restrictions']>1])
df_all[df_all['donor_restrictions']>1][['FYE', 'donor_restrictions', 'temp_rest_assets', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'perm_rest_assets', 'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'net_assets', 'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY',
#'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
#'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']][-6:]
1230 1117
FYE | donor_restrictions | temp_rest_assets | TemporarilyRstrNetAssetsGrp | TemporarilyRestrictedNetAssets | perm_rest_assets | PermanentlyRstrNetAssetsGrp | PermanentlyRestrictedNetAssets | net_assets | NetAssetsOrFundBalancesEOYAmt | NetAssetsOrFundBalancesEOY | TotalNetAssetsFundBalanceGrp | TotalNetAssetsFundBalances | NetAssetsOrFundBalancesEOYAmt | NetAssetsOrFundBalancesEOY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4768 | FY2012 | 1.029663 | 261414 | NaN | {u'BOY': u'589365', u'EOY': u'261414'} | 0 | NaN | NaN | 253883 | NaN | 253883 | NaN | {u'BOY': u'610579', u'EOY': u'253883'} | NaN | 253883 |
3425 | FY2014 | 1.162178 | 124675 | {u'BOYAmt': u'271888', u'EOYAmt': u'124675'} | NaN | 0 | NaN | NaN | 107277 | 107277 | NaN | {u'BOYAmt': u'440094', u'EOYAmt': u'107277'} | NaN | 107277 | NaN |
25435 | FY2014 | 1.426897 | 312031 | {u'BOYAmt': u'94893', u'EOYAmt': u'312031'} | NaN | 0 | NaN | NaN | 218678 | 218678 | NaN | {u'BOYAmt': u'608360', u'EOYAmt': u'218678'} | NaN | 218678 | NaN |
1853 | FY2014 | 1.013958 | 685532 | {u'BOYAmt': u'499406', u'EOYAmt': u'685532'} | NaN | 0 | NaN | NaN | 676095 | 676095 | NaN | {u'BOYAmt': u'426629', u'EOYAmt': u'676095'} | NaN | 676095 | NaN |
1872 | FY2014 | 1.001405 | 2144621 | {u'BOYAmt': u'1579409', u'EOYAmt': u'2144621'} | NaN | 6644457 | {u'BOYAmt': u'6501148', u'EOYAmt': u'6644457'} | NaN | 8776750 | 8776750 | NaN | {u'BOYAmt': u'8480604', u'EOYAmt': u'8776750'} | NaN | 8776750 | NaN |
1938 | FY2014 | 1.001405 | 2144621 | {u'BOYAmt': u'1579409', u'EOYAmt': u'2144621'} | NaN | 6644457 | {u'BOYAmt': u'6501148', u'EOYAmt': u'6644457'} | NaN | 8776750 | 8776750 | NaN | {u'BOYAmt': u'8480604', u'EOYAmt': u'8776750'} | NaN | 8776750 | NaN |
df_all['FeesForServicesAccountingGrp_v2'] = np.where(df_all['FeesForServicesAccountingGrp'].notnull(),
df_all['FeesForServicesAccountingGrp'],
df_all['FeesForServicesAccounting'])
print len(df_all[df_all['FeesForServicesAccountingGrp_v2'].isnull()]), '\n'
print len(df_all[df_all['FeesForServicesAccountingGrp_v2'].notnull()]), '\n'
3723 25717
for index, row in df_all[df_all['FeesForServicesAccountingGrp_v2'].notnull()][:].iterrows():
#print type(row['FeesForServicesAccountingGrp_v2']), row['FeesForServicesAccountingGrp_v2']
if 'Total' in row['FeesForServicesAccountingGrp_v2']:
#print 'THERE IS "Total"', row['FeesForServicesAccountingGrp_v2']['Total']
df_all.ix[index, 'FeesForServicesAccounting_TotalAmt'] = row['FeesForServicesAccountingGrp_v2']['Total']
if 'TotalAmt' in row['FeesForServicesAccountingGrp_v2']:
#print 'THERE IS "TotalAmt"', row['FeesForServicesAccountingGrp_v2']['TotalAmt']
df_all.ix[index, 'FeesForServicesAccounting_TotalAmt'] = row['FeesForServicesAccountingGrp_v2']['TotalAmt']
clear_output()
print ('Index: ', index, 'Fees:', df_all.ix[index, 'FeesForServicesAccounting_TotalAmt']), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
df_all['FeesForServicesAccounting_TotalAmt'].value_counts().sum()
('Index: ', 26647, 'Fees:', u'21614') # of minutes: 23.574959151
25717
df_all['FeesForServicesAccounting_TotalAmt'] = df_all['FeesForServicesAccounting_TotalAmt'].astype('float')
df_all['FeesForServicesAccounting_binary'] = np.where(df_all['FeesForServicesAccounting_TotalAmt']>0, 1,0)
df_all['FeesForServicesAccounting_binary'].value_counts()
1 24439 0 5001 Name: FeesForServicesAccounting_binary, dtype: int64
print '# of rows where fees less than 0:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']<0]), '\n'
print '# of rows where fees equal zero:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']==0]), '\n'
print '# of rows where fees greater than zero:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt']>0]), '\n'
print '# of rows missing values:', len(df_all[df_all['FeesForServicesAccounting_TotalAmt'].isnull()]), '\n'
# of rows where fees less than 0: 4 # of rows where fees equal zero: 1274 # of rows where fees greater than zero: 24439 # of rows missing values: 3723
print '# of rows that should have value of zero:', 4+1274+3723
# of rows that should have value of zero: 5001
df_all[df_all['FeesForServicesAccounting_TotalAmt']<0][:][['FeesForServicesAccounting_TotalAmt']]
FeesForServicesAccounting_TotalAmt | |
---|---|
25894 | -2500.0 |
24536 | -67439.0 |
1466 | -48284.0 |
12588 | -44250.0 |
df_all['FeesForServicesAccounting_binary'].value_counts()
1 24439 0 5001 Name: FeesForServicesAccounting_binary, dtype: int64
cols = cols + ['FeesForServicesAccounting_binary']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary']
print len(df_all[df_all['donor_restrictions']>100])
df_all[df_all['donor_restrictions']>100][cols]
1
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
27377 | 840516736 | FY2014 | 10 | NaN | 9 | NaN | false | NaN | true | NaN | true | NaN | 260654 | 0 | 0 | inf | false | NaN | NaN | NaN | true | NaN | NaN | NaN | false | NaN | {u'#text': u'true', u'@referenceDocumentId': u'00000004', u'@referenceDocumentName': u'IRS990ScheduleD'} | NaN | 0 |
cols = cols + ['FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary', 'FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship']
cols = cols + ['TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary', 'FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship', 'TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities']
cols = cols + ['GovernmentGrantsAmt', 'GovernmentGrants']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary', 'FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship', 'TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities', 'GovernmentGrantsAmt', 'GovernmentGrants']
cols = cols + ['ElectionOfBoardMembersInd', 'ElectionOfBoardMembers']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary', 'FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship', 'TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities', 'GovernmentGrantsAmt', 'GovernmentGrants', 'ElectionOfBoardMembersInd', 'ElectionOfBoardMembers']
cols = cols + ['CYTotalFundraisingExpenseAmt', 'TotalFundrsngExpCurrentYear']
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary', 'FamilyOrBusinessRlnInd', 'FamilyOrBusinessRelationship', 'TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities', 'GovernmentGrantsAmt', 'GovernmentGrants', 'ElectionOfBoardMembersInd', 'ElectionOfBoardMembers', 'CYTotalFundraisingExpenseAmt', 'TotalFundrsngExpCurrentYear']
len(df_all)
29440
df_all.to_pickle('e-file data for 7,133 of 8,304 2016 EINs (n=24940).pkl')
df_all = df_all.sort_values(by=['EIN', 'FYE'], ascending=[1,0])
df_all[:5][cols]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18819 | 010202467 | FY2014 | 22 | NaN | 21 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | 4433997 | 7351259 | 23690097 | 0.50 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | 1 | 0 | NaN | NaN | NaN | 7932282 | NaN | 1 | NaN |
7965 | 010202467 | FY2013 | 21 | NaN | 21 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | 4193257 | 5880227 | 21657900 | 0.47 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | NaN | 1 | 1 | NaN | NaN | NaN | 6258147 | NaN | 1 | NaN |
8989 | 010202467 | FY2012 | NaN | 20 | NaN | 20 | NaN | 1 | NaN | 0 | NaN | 1 | 4153627 | 5773597 | 22130919 | 0.45 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | NaN | 1 | NaN | NaN | NaN | 7295185 | NaN | 1 |
22904 | 010202467 | FY2011 | NaN | 23 | NaN | 23 | NaN | 1 | NaN | 0 | NaN | 1 | 3096087 | 6554891 | 20486698 | 0.47 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | NaN | 1 | NaN | NaN | NaN | 9819406 | NaN | 1 |
19649 | 010202467 | FY2010 | NaN | 24 | NaN | 24 | NaN | 1 | NaN | 1 | NaN | 1 | 2873816 | 7110362 | 17935900 | 0.56 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1039900001'} | 1 | NaN | 1 | NaN | NaN | NaN | 7006788 | NaN | 1 |
df_all[-3:][cols]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
23577 | 990350803 | FY2012 | NaN | 11 | NaN | 11 | NaN | 0 | NaN | 1 | NaN | 1 | 100000 | 467396 | 2441674 | 0.232380 | NaN | 0 | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 |
16294 | 990350803 | FY2011 | NaN | 11 | NaN | 11 | NaN | 0 | NaN | 1 | NaN | 1 | 0 | 171880 | 1953372 | 0.087991 | NaN | 0 | NaN | 0 | NaN | 1 | NaN | 1 | NaN | 1 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 |
26647 | 990350803 | FY2010 | NaN | 7 | NaN | 7 | NaN | 0 | NaN | 1 | NaN | NaN | 0 | 96956 | 1720152 | 0.056365 | NaN | 0 | NaN | 0 | NaN | 0 | NaN | 0 | NaN | 1 | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 |
print len(df_all[cols].groupby('EIN').agg('first'))
governance = df_all[cols].groupby('EIN').agg('first')
print len(governance)
governance = governance.reset_index()
governance[:5]
7133 7133
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | CYTotalFundraisingExpenseAmt | TotalFundrsngExpCurrentYear | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 010202467 | FY2014 | 22 | 20 | 21 | 20 | 1 | 1 | 1 | 0 | 1 | 1 | 4433997 | 7351259 | 23690097 | 0.50 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 0 | 1 | NaN | NaN | 7932282 | 7295185 | 1 | 1 | 554608 | 358835 |
1 | 010211478 | FY2015 | 18 | 18 | 18 | 18 | false | false | true | true | true | true | 2717000 | 908556 | 4804612 | 0.75 | false | false | NaN | NaN | true | true | NaN | NaN | false | false | {u'#text': u'true', u'@referenceDocumentId': u'RetDoc3'} | {u'#text': u'true', u'@referenceDocumentId': u'RetDoc3'} | 0 | false | false | NaN | NaN | 42442 | 129755 | false | false | 415743 | 363139 |
2 | 010211513 | FY2014 | 23 | 22 | 21 | 20 | false | false | true | true | true | true | 15329657 | 38570318 | 492985953 | 0.11 | {u'#text': u'true', u'@referenceDocumentId': u'IRS990ScheduleK'} | {u'#text': u'true', u'@referenceDocumentId': u'IRS990ScheduleK'} | -57641 | -12313 | true | true | true | true | true | true | {u'#text': u'true', u'@referenceDocumentId': u'IRS990ScheduleD'} | {u'#text': u'true', u'@referenceDocumentId': u'IRS990ScheduleD'} | 1 | false | true | {u'BOYAmt': u'106014868', u'EOYAmt': u'104647760'} | {u'BOY': u'77905017', u'EOY': u'109371484'} | 77323736 | 57810755 | false | false | 3474538 | 2402953 |
3 | 010211530 | FY2014 | 28 | 29 | 28 | 29 | 0 | 0 | 0 | 0 | 1 | 1 | 477857 | 1161177 | 12763602 | 0.13 | 0 | 0 | 0 | 0 | 1 | 1 | NaN | NaN | 0 | 0 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 0 | 0 | 0 | NaN | NaN | 237183 | 318383 | 1 | 1 | 181290 | 255685 |
4 | 010211543 | FY2014 | 35 | 34 | 35 | 34 | 0 | 0 | 1 | 1 | 1 | 1 | 779327 | 4259694 | 16702601 | 0.30 | 0 | 0 | 0 | 0 | 1 | 1 | NaN | NaN | 0 | 0 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 0 | 0 | NaN | NaN | 94663 | 113548 | 0 | 0 | 150504 | 103085 |
governance['FYE'].value_counts()
FY2014 5225 FY2015 1578 FY2013 197 FY2012 55 FY2011 41 FY2010 37 Name: FYE, dtype: int64
print cols
['EIN', 'FYE', 'VotingMembersGoverningBodyCnt', 'NbrVotingMembersGoverningBody', 'VotingMembersIndependentCnt', 'NbrIndependentVotingMembers', 'DelegationOfMgmtDutiesInd', 'DelegationOfManagementDuties', 'Form990ProvidedToGvrnBodyInd', 'Form990ProvidedToGoverningBody', 'AuditCommitteeInd', 'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', 'TaxExemptBondsInd', 'TaxExemptBonds', 'NetUnrelatedBusTxblIncmAmt', 'NetUnrelatedBusinessTxblIncome', 'FSAuditedInd', 'FSAudited', 'FederalGrantAuditPerformedInd', 'FederalGrantAuditPerformed', 'FederalGrantAuditRequiredInd', 'FederalGrantAuditRequired', 'IndependentAuditFinclStmtInd', 'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary']
governance['independent_directors_num'] = governance['VotingMembersIndependentCnt']
governance['independent_directors_num'] = np.where(governance['independent_directors_num'].isnull(),
governance['NbrIndependentVotingMembers'], governance['independent_directors_num'])
print len(governance[governance['independent_directors_num'].isnull()])
#print governance['independent_directors_num'].value_counts()
governance['independent_directors_num'] = governance['independent_directors_num'].astype('int')
governance['independent_directors_num'].describe()
0
count 7133.00 mean 21.45 std 34.58 min 0.00 25% 11.00 50% 17.00 75% 26.00 max 2500.00 Name: independent_directors_num, dtype: float64
governance[governance['VotingMembersIndependentCnt'].isnull()][:2]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | independent_directors_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
35 | 010679337 | FY2013 | NaN | 11 | NaN | 10 | NaN | 0 | NaN | 1 | NaN | 1 | 111972495 | 584038945 | 1137256059 | 0.61 | NaN | 0 | NaN | 30871 | NaN | 1 | NaN | NaN | NaN | 0 | NaN | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1039900001'} | 1 | NaN | 1 | NaN | NaN | NaN | 237833 | NaN | 0 | 10 |
65 | 026015642 | FY2012 | NaN | 24 | NaN | 24 | NaN | 0 | NaN | 1 | NaN | 1 | 0 | 243648 | 1047304 | 0.23 | NaN | 0 | NaN | -34607 | NaN | 1 | NaN | NaN | NaN | 0 | NaN | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 0 | NaN | 1 | NaN | NaN | NaN | NaN | NaN | 0 | 24 |
governance['voting_directors'] = governance['VotingMembersGoverningBodyCnt']
governance['voting_directors'] = np.where(governance['voting_directors'].isnull(),
governance['NbrVotingMembersGoverningBody'], governance['voting_directors'])
print len(governance[governance['voting_directors'].isnull()])
#print governance['voting_directors'].value_counts()
governance['voting_directors'] = governance['voting_directors'].astype('int')
0
governance['independent_directors_pct'] = governance['independent_directors_num']/governance['voting_directors']
governance['independent_directors_pct'].describe()
count 7129.00 mean 0.97 std 1.09 min 0.00 25% 0.96 50% 1.00 75% 1.00 max 92.59 Name: independent_directors_pct, dtype: float64
governance[governance['independent_directors_pct'].isnull()]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | independent_directors_num | voting_directors | independent_directors_pct |
---|
governance['independent_directors_pct'] = np.where(governance['independent_directors_pct'].isnull(), 0,
governance['independent_directors_pct'])
governance['independent_directors_pct'].describe()
count 7133.00 mean 0.97 std 1.09 min 0.00 25% 0.96 50% 1.00 75% 1.00 max 92.59 Name: independent_directors_pct, dtype: float64
Second version -- none over 100%
print len(governance[governance['independent_directors_pct']>1])
governance['independent_directors_pct_v2'] = np.where(governance['independent_directors_pct']>1, 1,
governance['independent_directors_pct'])
print len(governance[governance['independent_directors_pct_v2']>1])
print governance['independent_directors_pct_v2'].describe()
11 0 count 7133.00 mean 0.95 std 0.13 min 0.00 25% 0.96 50% 1.00 75% 1.00 max 1.00 Name: independent_directors_pct_v2, dtype: float64
governance['outsourced_mgt'] = governance['DelegationOfMgmtDutiesInd']
governance['outsourced_mgt'] = np.where(governance['outsourced_mgt'].isnull(),
governance['DelegationOfManagementDuties'], governance['outsourced_mgt'])
print len(governance[governance['outsourced_mgt'].isnull()])
print governance['outsourced_mgt'].value_counts()
0 0 4365 false 2636 1 92 true 40 Name: outsourced_mgt, dtype: int64
governance['outsourced_mgt'] = np.where(governance['outsourced_mgt']=='true', 1, governance['outsourced_mgt'])
governance['outsourced_mgt'] = np.where(governance['outsourced_mgt']=='false', 0, governance['outsourced_mgt'])
print governance['outsourced_mgt'].value_counts()
0 4365 0 2636 1 92 1 40 Name: outsourced_mgt, dtype: int64
governance['outsourced_mgt']=governance['outsourced_mgt'].astype('int')
print governance['outsourced_mgt'].value_counts()
0 7001 1 132 Name: outsourced_mgt, dtype: int64
governance['990_review'] = governance['Form990ProvidedToGvrnBodyInd']
governance['990_review'] = np.where(governance['990_review'].isnull(),
governance['Form990ProvidedToGoverningBody'], governance['990_review'])
print len(governance[governance['990_review'].isnull()])
print governance['990_review'].value_counts()
0 1 4088 true 2352 0 369 false 324 Name: 990_review, dtype: int64
governance['990_review'] = np.where(governance['990_review']=='true', 1, governance['990_review'])
governance['990_review'] = np.where(governance['990_review']=='false', 0, governance['990_review'])
governance['990_review'] = governance['990_review'].astype('int')
print governance['990_review'].value_counts()
1 6440 0 693 Name: 990_review, dtype: int64
I'M ASSIGNING A VALUE OF '0' TO THE 138 CASES MISSING VALUES
governance['audit_committee'] = governance['AuditCommitteeInd']
governance['audit_committee'] = np.where(governance['audit_committee'].isnull(),
governance['AuditCommittee'], governance['audit_committee'])
print len(governance[governance['audit_committee'].isnull()])
print governance['audit_committee'].value_counts()
138 1 4205 true 2454 0 189 false 147 Name: audit_committee, dtype: int64
governance['audit_committee'] = np.where(governance['audit_committee']=='true', 1, governance['audit_committee'])
governance['audit_committee'] = np.where(governance['audit_committee']=='false', 0, governance['audit_committee'])
governance['audit_committee'] = np.where(governance['audit_committee'].isnull(), 0, governance['audit_committee'])
governance['audit_committee']=governance['audit_committee'].astype('int')
print governance['audit_committee'].value_counts()
1 6659 0 474 Name: audit_committee, dtype: int64
governance.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
perm_rest_assets | 7133.00 | 5398758.08 | 34888704.49 | 0.00 | 0.00 | 0.00 | 1033359.00 | 1097020543.00 |
temp_rest_assets | 7133.00 | 7347199.39 | 40974295.28 | -1417569.00 | 119999.00 | 660803.00 | 2879306.00 | 1473750677.00 |
net_assets | 7133.00 | 31409548.61 | 175347462.23 | -139954787.00 | 1766750.00 | 5157328.00 | 15828642.00 | 7982613662.00 |
donor_restrictions | 7133.00 | inf | nan | -156.44 | 0.06 | 0.24 | 0.55 | inf |
FeesForServicesAccounting_binary | 7133.00 | 0.83 | 0.38 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 |
independent_directors_num | 7133.00 | 21.45 | 34.58 | 0.00 | 11.00 | 17.00 | 26.00 | 2500.00 |
voting_directors | 7133.00 | 21.70 | 18.27 | 0.00 | 12.00 | 18.00 | 26.00 | 434.00 |
independent_directors_pct | 7133.00 | 0.97 | 1.09 | 0.00 | 0.96 | 1.00 | 1.00 | 92.59 |
independent_directors_pct_v2 | 7133.00 | 0.95 | 0.13 | 0.00 | 0.96 | 1.00 | 1.00 | 1.00 |
outsourced_mgt | 7133.00 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
990_review | 7133.00 | 0.90 | 0.30 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 |
audit_committee | 7133.00 | 0.93 | 0.25 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 |
#print governance['donor_restrictions'].min()
#governance['donor_restrictions'].max()
-156.442553191
inf
print len(governance[governance['donor_restrictions']>100])
governance[governance['donor_restrictions']>100]
1
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | independent_directors_num | voting_directors | independent_directors_pct | independent_directors_pct_v2 | outsourced_mgt | 990_review | audit_committee | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5939 | 840516736 | FY2014 | 10 | 10 | 9 | 9 | false | false | true | true | true | true | 260654 | 0 | 0 | inf | false | false | NaN | NaN | true | true | NaN | NaN | false | false | {u'#text': u'true', u'@referenceDocumentId': u'00000004', u'@referenceDocumentName': u'IRS990ScheduleD'} | {u'#text': u'true', u'@referenceDocumentId': u'00000004', u'@referenceDocumentName': u'IRS990ScheduleD'} | 0 | true | false | NaN | NaN | NaN | 5644 | false | false | 9 | 10 | 0.90 | 0.90 | 0 | 1 | 1 |
print len(governance[governance['donor_restrictions']>10])
6
governance[governance['donor_restrictions']>10]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | independent_directors_num | voting_directors | independent_directors_pct | outsourced_mgt | 990_review | audit_committee | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
435 | 061343149 | FY2015 | 30 | 24 | 28 | 22 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 38420 | 478 | 80.376569 | 0 | 0 | 0 | 0 | 1 | 1 | NaN | NaN | 0 | 0 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 28 | 30 | 0.933333 | 0 | 1 | 1 |
3907 | 521591381 | FY2014 | 13 | 14 | 12 | 13 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 1893106 | 137848 | 13.733286 | 0 | 0 | -1009 | 880 | 1 | 1 | NaN | NaN | 0 | 0 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 12 | 13 | 0.923077 | 0 | 1 | 1 |
4438 | 570761297 | FY2015 | 24 | 23 | 21 | 23 | false | false | false | true | true | true | 224809 | 122436 | 21241 | 16.347865 | false | false | 0 | 0 | true | false | NaN | NaN | false | false | {u'#text': u'true', u'@referenceDocumentId': u'RetDoc4'} | false | 1 | 21 | 24 | 0.875000 | 0 | 0 | 1 |
5358 | 730568096 | FY2014 | 11 | 11 | 11 | 11 | 0 | 0 | 1 | 1 | 1 | 1 | 521821 | 1082185 | 63200 | 25.379842 | 0 | 0 | 0 | 0 | 1 | 1 | NaN | false | 0 | 0 | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 11 | 11 | 1.000000 | 0 | 1 | 1 |
7045 | 954291515 | FY2014 | 6 | 9 | 4 | 7 | 0 | true | 1 | true | 1 | false | 0 | 5844708 | 503140 | 11.616465 | 0 | false | 0 | 0 | 1 | true | 1 | true | 1 | true | {u'#text': u'0', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'true', u'@referenceDocumentId': u'IRS990ScheduleD'} | 0 | 4 | 6 | 0.666667 | 0 | 1 | 1 |
governance['donor_restrictions'] = np.where(governance['donor_restrictions']>100, 0, governance['donor_restrictions'])
governance['donor_restrictions'].describe()
count 7133.00 mean 0.32 std 2.27 min -156.44 25% 0.06 50% 0.24 75% 0.55 max 80.38 Name: donor_restrictions, dtype: float64
governance[:1]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | independent_directors_num | voting_directors | independent_directors_pct | independent_directors_pct_v2 | outsourced_mgt | 990_review | audit_committee | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 010202467 | FY2014 | 22 | 20 | 21 | 20 | 1 | 1 | 1 | 0 | 1 | 1 | 4433997 | 7351259 | 23690097 | 0.50 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 0 | 1 | NaN | NaN | 7932282 | 7295185 | 1 | 1 | 21 | 22 | 0.95 | 0.95 | 1 | 1 | 1 |
governance['restricted_donations'] = np.where(((governance['perm_rest_assets']>1)|
(governance['temp_rest_assets']>1)), 1,0)
governance['restricted_donations'].value_counts()
1 6414 0 719 Name: restricted_donations, dtype: int64
governance[['perm_rest_assets', 'temp_rest_assets', 'restricted_donations']][20:25]
perm_rest_assets | temp_rest_assets | restricted_donations | |
---|---|---|---|
20 | 23608300 | 3240612 | 1 |
21 | 0 | 8161109 | 1 |
22 | 153032 | 1981180 | 1 |
23 | 0 | 0 | 0 |
24 | 11057335 | 14212432 | 1 |
'''
print len(governance[governance['TemporarilyRstrNetAssetsGrp'].isnull()])
#, 'TemporarilyRestrictedNetAssets',
#'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
#'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY',
'''
842
#governance[267:270]
'''
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:].iterrows():
if pd.notnull(row['TemporarilyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['TemporarilyRstrNetAssetsGrp']:
temp_rest = row['TemporarilyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['TemporarilyRestrictedNetAssets']):
print 'going to second'
if 'EOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['EOY']
elif 'BOY' in row['TemporarilyRestrictedNetAssets']:
temp_rest = row['TemporarilyRestrictedNetAssets']['BOY']
governance.ix[index, 'temp_rest_assets'] = temp_rest
clear_output()
print ('Index: ', index, 'Temp. Restr Assets:', temp_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print governance['temp_rest_assets'].value_counts().sum()
'''
('Index: ', 7132, 'Temp. Restr Assets:', u'52310') # of minutes: 0.214020017783
7133
'''
print len(governance[governance['temp_rest_assets'].notnull()])
print len(governance[governance['temp_rest_assets'].isnull()])
print governance['temp_rest_assets'].describe()
governance['temp_rest_assets'] = governance['temp_rest_assets'].astype('int')
print governance['temp_rest_assets'].describe()
'''
7133 0 count 7133 unique 6282 top 0 freq 195 Name: temp_rest_assets, dtype: object count 7.133000e+03 mean 7.934227e+06 std 4.203098e+07 min -1.417569e+06 25% 2.050260e+05 50% 8.326920e+05 75% 3.255075e+06 max 1.473751e+09 Name: temp_rest_assets, dtype: float64
#governance[:2]
'''
print len(governance[governance['PermanentlyRestrictedNetAssets'].notnull()])
print len(governance[governance['PermanentlyRestrictedNetAssets'].isnull()])
print len(governance[governance['PermanentlyRstrNetAssetsGrp'].notnull()])
print len(governance[governance['PermanentlyRstrNetAssetsGrp'].isnull()])
'''
3520 3613 3699 3434
'''
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:].iterrows():
if pd.notnull(row['PermanentlyRstrNetAssetsGrp']):
#print 'not null!'
if 'EOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['EOYAmt']
elif 'BOYAmt' in row['PermanentlyRstrNetAssetsGrp']:
perm_rest = row['PermanentlyRstrNetAssetsGrp']['BOYAmt']
elif pd.notnull(row['PermanentlyRestrictedNetAssets']):
print 'going to second variable'
if 'EOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['EOY']
elif 'BOY' in row['PermanentlyRestrictedNetAssets']:
perm_rest = row['PermanentlyRestrictedNetAssets']['BOY']
governance.ix[index, 'perm_rest_assets'] = perm_rest
clear_output()
print ('Index: ', index, 'Perm. Restr Assets:', perm_rest), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print governance['perm_rest_assets'].value_counts().sum()
'''
('Index: ', 7132, 'Perm. Restr Assets:', u'284363') # of minutes: 0.194717868169 7133
'''
print len(governance[governance['perm_rest_assets'].notnull()])
print len(governance[governance['perm_rest_assets'].isnull()])
print governance['perm_rest_assets'].describe(), '\n'
governance['perm_rest_assets'] = governance['perm_rest_assets'].astype('int')
print governance['perm_rest_assets'].describe()
'''
7133 0 count 7133 unique 3215 top 0 freq 887 Name: perm_rest_assets, dtype: object count 7.133000e+03 mean 7.831133e+06 std 3.922188e+07 min 0.000000e+00 25% 6.430000e+04 50% 5.732270e+05 75% 3.064642e+06 max 1.097021e+09 Name: perm_rest_assets, dtype: float64
'''
governance['net_assets'] = governance['NetAssetsOrFundBalancesEOYAmt']
governance['net_assets'] = np.where(governance['net_assets'].isnull(),
governance['NetAssetsOrFundBalancesEOY'], governance['net_assets'])
print governance['net_assets'].describe(), '\n'
governance['net_assets'] = governance['net_assets'].astype('int')
print governance['net_assets'].describe()
'''
count 7133 unique 7120 top 0 freq 14 Name: net_assets, dtype: object count 7.133000e+03 mean 3.140955e+07 std 1.753475e+08 min -1.399548e+08 25% 1.766750e+06 50% 5.157328e+06 75% 1.582864e+07 max 7.982614e+09 Name: net_assets, dtype: float64
'''
governance['donor_restrictions'] = (governance['temp_rest_assets']+governance['perm_rest_assets'])/governance['net_assets']
governance['donor_restrictions'].describe()
'''
count 7.131000e+03 mean inf std NaN min -4.987176e+05 25% 1.567595e-01 50% 4.216044e-01 75% 8.305730e-01 max inf Name: donor_restrictions, dtype: float64
'''
governance['donor_restrictions'] = np.where(governance['donor_restrictions'].isnull(), 0, governance['donor_restrictions'])
governance['donor_restrictions'].describe()
'''
count 7.133000e+03 mean inf std NaN min -4.987176e+05 25% 1.567123e-01 50% 4.216012e-01 75% 8.303938e-01 max inf Name: donor_restrictions, dtype: float64
'''
df_all[df_all['EIN']=='010287624'][['FYE', 'TemporarilyRstrNetAssetsGrp', 'TemporarilyRestrictedNetAssets',
'PermanentlyRstrNetAssetsGrp', 'PermanentlyRestrictedNetAssets',
'UnrestrictedNetAssetsGrp', 'UnrestrictedNetAssets',
'TotalNetAssetsFundBalanceGrp', 'TotalNetAssetsFundBalances',
'NetAssetsOrFundBalancesBOYAmt', 'NetAssetsOrFundBalancesBOY',
'NetAssetsOrFundBalancesEOYAmt', 'NetAssetsOrFundBalancesEOY']]
'''
FYE | TemporarilyRstrNetAssetsGrp | TemporarilyRestrictedNetAssets | PermanentlyRstrNetAssetsGrp | PermanentlyRestrictedNetAssets | UnrestrictedNetAssetsGrp | UnrestrictedNetAssets | TotalNetAssetsFundBalanceGrp | TotalNetAssetsFundBalances | NetAssetsOrFundBalancesBOYAmt | NetAssetsOrFundBalancesBOY | NetAssetsOrFundBalancesEOYAmt | NetAssetsOrFundBalancesEOY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
19137 | FY2014 | {u'BOYAmt': u'763419', u'EOYAmt': u'718875'} | NaN | NaN | NaN | {u'BOYAmt': u'1114442', u'EOYAmt': u'1112268'} | NaN | {u'BOYAmt': u'1877861', u'EOYAmt': u'1831143'} | NaN | 1877861 | NaN | 1831143 | NaN |
4153 | FY2013 | {u'BOYAmt': u'568056', u'EOYAmt': u'763419'} | NaN | NaN | NaN | {u'BOYAmt': u'969792', u'EOYAmt': u'1114442'} | NaN | {u'BOYAmt': u'1537848', u'EOYAmt': u'1877861'} | NaN | 1537848 | NaN | 1877861 | NaN |
4696 | FY2012 | NaN | {u'BOY': u'489517', u'EOY': u'568056'} | NaN | NaN | NaN | {u'BOY': u'1227268', u'EOY': u'969792'} | NaN | {u'BOY': u'1716785', u'EOY': u'1537848'} | NaN | 1716785 | NaN | 1537848 |
22671 | FY2011 | NaN | {u'BOY': u'472102', u'EOY': u'489517'} | NaN | NaN | NaN | {u'BOY': u'1451414', u'EOY': u'1227268'} | NaN | {u'BOY': u'1923516', u'EOY': u'1716785'} | NaN | 1923516 | NaN | 1716785 |
#(718875+0)/(718875+1112268)
0.39258266558100596
#print (718875+1112268)-1831143
#print 1831143 - (718875+1112268)
0 -1
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:].iterrows():
bond = 0
if pd.notnull(row['TaxExemptBondsInd']):
#print 'not null!'
if '#text' in row['TaxExemptBondsInd']:
bond = row['TaxExemptBondsInd']['#text']
else:
bond = row['TaxExemptBondsInd']
elif pd.notnull(row['TaxExemptBonds']):
if '#text' in row['TaxExemptBonds']:
bond = row['TaxExemptBonds']['#text']
else:
bond = row['TaxExemptBonds']
governance.ix[index, 'tax_exempt_bond'] = bond
clear_output()
print ('Index: ', index, 'Tax-exempt bond:', bond), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
print governance['tax_exempt_bond'].value_counts().sum()
('Index: ', 7132, 'Tax-exempt bond:', u'0') # of minutes: 0.197681299845 7133
print governance['tax_exempt_bond'].value_counts(), '\n'
governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond']=='true', 1, governance['tax_exempt_bond'])
governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond']=='false', 0, governance['tax_exempt_bond'])
governance['tax_exempt_bond'] = np.where(governance['tax_exempt_bond'].isnull(), 0, governance['tax_exempt_bond'])
governance['tax_exempt_bond'] = governance['tax_exempt_bond'].astype('int')
print governance['tax_exempt_bond'].value_counts()
0 4251 false 2580 1 206 true 96 Name: tax_exempt_bond, dtype: int64 0 6831 1 302 Name: tax_exempt_bond, dtype: int64
NO NEED TO DO THIS -- MY VARIABLE ALREADY TAPS IT
governance[['TaxExemptBondLiabilitiesGrp', 'TaxExemptBondLiabilities', 'tax_exempt_bond']][2:4]
TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | tax_exempt_bond | |
---|---|---|---|
2 | {u'BOYAmt': u'106014868', u'EOYAmt': u'104647760'} | {u'BOY': u'77905017', u'EOY': u'109371484'} | 1 |
3 | NaN | NaN | 0 |
AS IN YETMAN AND YETMAN'S TAXABLE REVENUE VARIABLE -- A REGULATORY OVERSIGHT VARIABLE, GIVEN THAT IT IS RELATED TO AUDIT LIKELIHOOD.
governance['taxable_revenue'] = governance['NetUnrelatedBusTxblIncmAmt']
governance['taxable_revenue'] = np.where(governance['taxable_revenue'].isnull(),
governance['NetUnrelatedBusinessTxblIncome'], governance['taxable_revenue'])
print len(governance[governance['taxable_revenue'].isnull()])
print governance['taxable_revenue'].describe(), '\n'
governance['taxable_revenue'] = np.where(governance['taxable_revenue'].isnull(), 0, governance['taxable_revenue'])
print governance['taxable_revenue'].describe(), '\n'
governance['taxable_revenue'] = governance['taxable_revenue'].astype('int')
print governance['taxable_revenue'].describe(), '\n'
1743 count 5390 unique 971 top 0 freq 4406 Name: taxable_revenue, dtype: object count 7133 unique 972 top 0 freq 4406 Name: taxable_revenue, dtype: object count 7133.00 mean -7272.61 std 156439.97 min -8633000.00 25% 0.00 50% 0.00 75% 0.00 max 2376271.00 Name: taxable_revenue, dtype: float64
CREATE TWO BINARY VERSION -- ONE FOR TAXABLE REVENUE GREATER THAN ZERO AND ONE FOR TAXABLE REVENUE OTHER THAN ZERO
print len(governance[governance['taxable_revenue']>0])
print len(governance[governance['taxable_revenue']==0])
print len(governance[governance['taxable_revenue']<0])
governance['taxable_revenue_binary'] = np.where(governance['taxable_revenue']>0, 1,0)
print governance['taxable_revenue_binary'].value_counts(), '\n'
governance['taxable_revenue_neg_or_pos_rev_binary'] = np.where( ((governance['taxable_revenue']>0) |
(governance['taxable_revenue']<0)), 1,0)
print governance['taxable_revenue_neg_or_pos_rev_binary'].value_counts()
361 6149 623 0 6772 1 361 Name: taxable_revenue_binary, dtype: int64 0 6149 1 984 Name: taxable_revenue_neg_or_pos_rev_binary, dtype: int64
governance[:1]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | independent_directors_num | voting_directors | independent_directors_pct | independent_directors_pct_v2 | outsourced_mgt | 990_review | audit_committee | restricted_donations | tax_exempt_bond | taxable_revenue | taxable_revenue_binary | taxable_revenue_neg_or_pos_rev_binary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 010202467 | FY2014 | 22 | 20 | 21 | 20 | 1 | 1 | 1 | 0 | 1 | 1 | 4433997 | 7351259 | 23690097 | 0.50 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 0 | 1 | NaN | NaN | 7932282 | 7295185 | 1 | 1 | 21 | 22 | 0.95 | 0.95 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
governance['audited_financials'] = np.nan
print len(governance[governance['audited_financials'].notnull()])
governance['audited_financials'] = np.where(governance['FSAuditedInd'].notnull(), governance['FSAuditedInd'],
governance['audited_financials'] )
print len(governance[governance['audited_financials'].notnull()]), '\n'
governance['audited_financials'] = np.where( ((governance['audited_financials'].isnull()) &
(governance['FSAudited'].notnull())),
governance['FSAudited'], governance['audited_financials'] )
print len(governance[governance['audited_financials'].notnull()]), '\n'
governance['audited_financials'].value_counts()
0 6914 7133
1 4254 true 2454 false 222 0 203 Name: audited_financials, dtype: int64
governance['audited_financials'] = np.where( governance['audited_financials']=='true', 1, governance['audited_financials'] )
governance['audited_financials'] = np.where( governance['audited_financials']=='1', 1, governance['audited_financials'] )
governance['audited_financials'] = np.where( governance['audited_financials']=='false', 0, governance['audited_financials'] )
governance['audited_financials'] = np.where( governance['audited_financials']=='0', 0, governance['audited_financials'] )
print len(governance[governance['audited_financials'].notnull()])
governance['audited_financials'].value_counts()
7133
1 6708 0 425 Name: audited_financials, dtype: int64
governance['federal_grant_audit_performed'] = governance['FederalGrantAuditPerformedInd']
print len(governance[governance['federal_grant_audit_performed'].isnull()])
governance['federal_grant_audit_performed'] = np.where(governance['federal_grant_audit_performed'].isnull(),
governance['FederalGrantAuditPerformed'], governance['federal_grant_audit_performed'])
print len(governance[governance['federal_grant_audit_performed'].isnull()])
print governance['federal_grant_audit_performed'].value_counts()
5974 5061 1 931 false 646 true 490 0 5 Name: federal_grant_audit_performed, dtype: int64
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='true', 1, governance['federal_grant_audit_performed'] )
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='1', 1, governance['federal_grant_audit_performed'] )
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='false', 0, governance['federal_grant_audit_performed'] )
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed']=='0', 0, governance['federal_grant_audit_performed'] )
print len(governance[governance['federal_grant_audit_performed'].notnull()])
governance['federal_grant_audit_performed'].value_counts()
2072
1 1421 0 651 Name: federal_grant_audit_performed, dtype: int64
governance['federal_grant_audit_performed'] = np.where( governance['federal_grant_audit_performed'].isnull(),
0, governance['federal_grant_audit_performed'])
governance['federal_grant_audit_performed'].value_counts()
0 5712 1 1421 Name: federal_grant_audit_performed, dtype: int64
governance['federal_grant_audit_required'] = governance['FederalGrantAuditRequiredInd']
print len(governance[governance['federal_grant_audit_required'].isnull()])
governance['federal_grant_audit_required'] = np.where(governance['federal_grant_audit_required'].isnull(),
governance['FederalGrantAuditRequired'], governance['federal_grant_audit_required'])
print len(governance[governance['federal_grant_audit_required'].isnull()]), '\n'
print governance['federal_grant_audit_required'].value_counts(), '\n'
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required'].isnull(),
0, governance['federal_grant_audit_required'])
governance['federal_grant_audit_required'].value_counts()
288 45 0 3746 false 2214 1 719 true 409 Name: federal_grant_audit_required, dtype: int64
0 3746 false 2214 1 719 true 409 0 45 Name: federal_grant_audit_required, dtype: int64
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='true', 1, governance['federal_grant_audit_required'] )
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='1', 1, governance['federal_grant_audit_required'] )
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='false', 0, governance['federal_grant_audit_required'] )
governance['federal_grant_audit_required'] = np.where( governance['federal_grant_audit_required']=='0', 0, governance['federal_grant_audit_required'] )
print len(governance[governance['federal_grant_audit_required'].notnull()])
governance['federal_grant_audit_required'].value_counts()
7133
0 6005 1 1128 Name: federal_grant_audit_required, dtype: int64
governance['independent_audited_fs'] = governance['IndependentAuditFinclStmtInd']
governance['independent_audited_fs'] = np.where(governance['independent_audited_fs'].isnull(),
governance['IndependentAuditFinancialStmt'], governance['independent_audited_fs'])
print len(governance[governance['independent_audited_fs'].isnull()])
print governance['independent_audited_fs'].value_counts()
0
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-512-199086f70e01> in <module>() 3 governance['IndependentAuditFinancialStmt'], governance['independent_audited_fs']) 4 print len(governance[governance['independent_audited_fs'].isnull()]) ----> 5 print governance['independent_audited_fs'].value_counts() //anaconda/lib/python2.7/site-packages/pandas/core/base.pyc in value_counts(self, normalize, sort, ascending, bins, dropna) 949 from pandas.core.algorithms import value_counts 950 result = value_counts(self, sort=sort, ascending=ascending, --> 951 normalize=normalize, bins=bins, dropna=dropna) 952 return result 953 //anaconda/lib/python2.7/site-packages/pandas/core/algorithms.pyc in value_counts(values, sort, ascending, normalize, bins, dropna) 376 else: 377 # ndarray path. pass original to handle DatetimeTzBlock --> 378 keys, counts = _value_counts_arraylike(values, dropna=dropna) 379 380 from pandas import Index, Series //anaconda/lib/python2.7/site-packages/pandas/core/algorithms.pyc in _value_counts_arraylike(values, dropna) 444 values = _ensure_object(values) 445 mask = isnull(values) --> 446 keys, counts = htable.value_count_object(values, mask) 447 if not dropna and mask.any(): 448 keys = np.insert(keys, 0, np.NaN) pandas/hashtable.pyx in pandas.hashtable.value_count_object (pandas/hashtable.c:21313)() pandas/hashtable.pyx in pandas.hashtable.value_count_object (pandas/hashtable.c:21048)() TypeError: unhashable type: 'dict'
len(governance[governance['independent_audited_fs'].isnull()])
0
GIVEN ABOVE ERROR, I WILL HAVE TO EXTRACT THE RIGHT SUB-VARIABLE FROM THE COLUMN
from IPython.display import display, clear_output ##### FOR USE WITH STDOUT (DYNAMIC, SINGLE-LINE PRINTING)
import timeit
start_time = timeit.default_timer()
for index, row in governance[:10].iterrows():
#bond = 0
if type(row['independent_audited_fs'])==dict and '#text' in row['independent_audited_fs']:
#print 'looking'
governance.ix[index, 'independent_audited_fs'] = row['independent_audited_fs']['#text']
clear_output()
print ('Index: ', index, 'Independent audited F/S:', governance.ix[index, 'independent_audited_fs']), '\n'
sys.stdout.flush()
elapsed = timeit.default_timer() - start_time
print '# of minutes: ', elapsed/60, '\n', '\n'
('Index: ', 9, 'Independent audited F/S:', u'1') # of minutes: 0.000384934743245
governance[:1]
EIN | FYE | VotingMembersGoverningBodyCnt | NbrVotingMembersGoverningBody | VotingMembersIndependentCnt | NbrIndependentVotingMembers | DelegationOfMgmtDutiesInd | DelegationOfManagementDuties | Form990ProvidedToGvrnBodyInd | Form990ProvidedToGoverningBody | AuditCommitteeInd | AuditCommittee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | TaxExemptBondsInd | TaxExemptBonds | NetUnrelatedBusTxblIncmAmt | NetUnrelatedBusinessTxblIncome | FSAuditedInd | FSAudited | FederalGrantAuditPerformedInd | FederalGrantAuditPerformed | FederalGrantAuditRequiredInd | FederalGrantAuditRequired | IndependentAuditFinclStmtInd | IndependentAuditFinancialStmt | FeesForServicesAccounting_binary | FamilyOrBusinessRlnInd | FamilyOrBusinessRelationship | TaxExemptBondLiabilitiesGrp | TaxExemptBondLiabilities | GovernmentGrantsAmt | GovernmentGrants | ElectionOfBoardMembersInd | ElectionOfBoardMembers | independent_directors_num | voting_directors | independent_directors_pct | independent_directors_pct_v2 | outsourced_mgt | 990_review | audit_committee | restricted_donations | tax_exempt_bond | taxable_revenue | taxable_revenue_binary | taxable_revenue_neg_or_pos_rev_binary | audited_financials | federal_grant_audit_performed | federal_grant_audit_required | independent_audited_fs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 010202467 | FY2014 | 22 | 20 | 21 | 20 | 1 | 1 | 1 | 0 | 1 | 1 | 4433997 | 7351259 | 23690097 | 0.50 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1053100001'} | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | {u'#text': u'1', u'@referenceDocumentId': u'RetDoc1040000001'} | 1 | 0 | 1 | NaN | NaN | 7932282 | 7295185 | 1 | 1 | 21 | 22 | 0.95 | 0.95 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
governance['independent_audited_fs'].value_counts()
1 3503 true 2050 0 954 false 626 Name: independent_audited_fs, dtype: int64
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='true', 1, governance['independent_audited_fs'] )
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='1', 1, governance['independent_audited_fs'] )
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='false', 0, governance['independent_audited_fs'] )
governance['independent_audited_fs'] = np.where( governance['independent_audited_fs']=='0', 0, governance['independent_audited_fs'] )
print len(governance[governance['independent_audited_fs'].notnull()])
governance['independent_audited_fs'].value_counts()
7133
1 5553 0 1580 Name: independent_audited_fs, dtype: int64
governance['no_relations'] = governance['FamilyOrBusinessRlnInd']
governance['no_relations'] = np.where(governance['no_relations'].isnull(),
governance['FamilyOrBusinessRelationship'], governance['no_relations'])
print len(governance[governance['no_relations'].isnull()])
print governance['no_relations'].value_counts()
0 0 3461 false 2092 1 996 true 584 Name: no_relations, dtype: int64
governance['no_relations'] = np.where( governance['no_relations']=='true', 1, governance['no_relations'] )
governance['no_relations'] = np.where( governance['no_relations']=='1', 1, governance['no_relations'] )
governance['no_relations'] = np.where( governance['no_relations']=='false', 0, governance['no_relations'] )
governance['no_relations'] = np.where( governance['no_relations']=='0', 0, governance['no_relations'] )
print len(governance[governance['no_relations'].notnull()])
governance['no_relations'].value_counts()
7133
0 5553 1 1580 Name: no_relations, dtype: int64
governance['elected_board'] = governance['ElectionOfBoardMembersInd']
governance['elected_board'] = np.where(governance['elected_board'].isnull(),
governance['ElectionOfBoardMembers'], governance['elected_board'])
print len(governance[governance['elected_board'].isnull()])
print governance['elected_board'].value_counts()
0 0 3843 false 2342 1 614 true 334 Name: elected_board, dtype: int64
governance['elected_board'] = np.where( governance['elected_board']=='true', 1, governance['elected_board'] )
governance['elected_board'] = np.where( governance['elected_board']=='1', 1, governance['elected_board'] )
governance['elected_board'] = np.where( governance['elected_board']=='false', 0, governance['elected_board'] )
governance['elected_board'] = np.where( governance['elected_board']=='0', 0, governance['elected_board'] )
print len(governance[governance['elected_board'].notnull()])
governance['elected_board'].value_counts()
7133
0 6185 1 948 Name: elected_board, dtype: int64
governance[['GovernmentGrantsAmt', ]]
governance['government_grant'] = governance['GovernmentGrantsAmt']
governance['government_grant'] = np.where(governance['government_grant'].isnull(),
governance['GovernmentGrants'], governance['government_grant'])
print len(governance[governance['government_grant'].isnull()])
print governance['government_grant'].describe(), '\n'
governance['government_grant'] = np.where(governance['government_grant'].isnull(), 0,governance['government_grant'] )
print len(governance[governance['government_grant'].isnull()])
print governance['government_grant'].describe(), '\n'
governance['government_grant'] = np.where(governance['government_grant']>0, 1,0)
print len(governance[governance['government_grant'].isnull()])
print governance['government_grant'].value_counts(), '\n'
3327 count 3806 unique 3559 top 0 freq 86 Name: government_grant, dtype: object 0 count 7133 unique 3560 top 0 freq 3327 Name: government_grant, dtype: int64 0 1 3806 0 3327 Name: government_grant, dtype: int64
governance2['zero_FR'] = np.nan
print len(governance2[governance2['zero_FR'].notnull()]), '\n'
governance2['zero_FR'] = np.where(governance2['TotalFundrsngExpCurrentYear'].notnull(),
governance2['TotalFundrsngExpCurrentYear'],
governance2['zero_FR'] )
print len(governance2[governance2['zero_FR'].notnull()]), '\n'
governance2['zero_FR'] = np.where( ((governance2['zero_FR'].isnull()) &
(governance2['CYTotalFundraisingExpenseAmt'].notnull())),
governance2['CYTotalFundraisingExpenseAmt'], governance2['zero_FR'] )
print len(governance2[governance2['zero_FR'].notnull()])
governance2['zero_FR'].value_counts()[:5]
0 6434 7133
0 86 140183 2 221765 2 135672 2 121804 2 Name: zero_FR, dtype: int64
governance2['zero_FR'] = np.where(governance2['zero_FR']=='0', 1,0)
governance2['zero_FR'].value_counts()
0 7047 1 86 Name: zero_FR, dtype: int64
governance.to_pickle('governance variables including original columns (n=7,133).pkl')
print governance.columns.tolist()
['EIN', 'FYE', u'VotingMembersGoverningBodyCnt', u'NbrVotingMembersGoverningBody', u'VotingMembersIndependentCnt', u'NbrIndependentVotingMembers', u'DelegationOfMgmtDutiesInd', u'DelegationOfManagementDuties', u'Form990ProvidedToGvrnBodyInd', u'Form990ProvidedToGoverningBody', u'AuditCommitteeInd', u'AuditCommittee', 'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions', u'TaxExemptBondsInd', u'TaxExemptBonds', u'NetUnrelatedBusTxblIncmAmt', u'NetUnrelatedBusinessTxblIncome', u'FSAuditedInd', u'FSAudited', u'FederalGrantAuditPerformedInd', u'FederalGrantAuditPerformed', u'FederalGrantAuditRequiredInd', u'FederalGrantAuditRequired', u'IndependentAuditFinclStmtInd', u'IndependentAuditFinancialStmt', 'FeesForServicesAccounting_binary', u'FamilyOrBusinessRlnInd', u'FamilyOrBusinessRelationship', u'TaxExemptBondLiabilitiesGrp', u'TaxExemptBondLiabilities', u'GovernmentGrantsAmt', u'GovernmentGrants', u'ElectionOfBoardMembersInd', u'ElectionOfBoardMembers', 'independent_directors_num', 'voting_directors', 'independent_directors_pct', 'independent_directors_pct_v2', 'outsourced_mgt', '990_review', 'audit_committee', 'restricted_donations', 'tax_exempt_bond', 'taxable_revenue', 'taxable_revenue_binary', 'taxable_revenue_neg_or_pos_rev_binary', 'audited_financials', 'federal_grant_audit_performed', 'federal_grant_audit_required', 'independent_audited_fs', 'no_relations', 'elected_board', 'government_grant']
governance_cols = ['EIN', 'FYE',
'independent_directors_num', 'independent_directors_pct', 'independent_directors_pct_v2',
'voting_directors',
'outsourced_mgt', '990_review', 'audit_committee',
'perm_rest_assets', 'temp_rest_assets', 'net_assets', 'donor_restrictions',
'restricted_donations',
'tax_exempt_bond', 'taxable_revenue', 'taxable_revenue_binary',
'taxable_revenue_neg_or_pos_rev_binary',
'audited_financials', 'independent_audited_fs',
'federal_grant_audit_performed', 'federal_grant_audit_required',
'FeesForServicesAccounting_binary',
'no_relations',
'elected_board',
'government_grant',
]
df = pd.read_pickle('2016 - Test 4 data.pkl')
print "Number of columns:", len(df.columns)
print "Number of observations:", len(df)
df.head(1)
Number of columns: 35 Number of observations: 8304
donor_advisory | donor_advisory_2016 | donor_advisory_2011_to_2016 | org_id | EIN | FYE | Form 990 FYE | ratings_system | 2011_data | 2016_data | conflict_of_interest_policy_v2 | records_retention_policy_v2 | whistleblower_policy_v2 | SOX_policies | SOX_policies_binary | SOX_policies_all_binary | program_efficiency | complexity | complexity_2011 | age | total_revenue_logged | category | state | tot_rev | category_Animals | category_Arts, Culture, Humanities | category_Community Development | category_Education | category_Environment | category_Health | category_Human Services | category_Human and Civil Rights | category_International | category_Religion | category_Research and Public Policy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
50709 | 0.0 | 0.0 | 0.0 | 5954 | 010202467 | FY2014 | 2014-12 | CN 2.1 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.0 | 0.794457 | 6.0 | NaN | 62.0 | 16.377993 | Research and Public Policy | ME | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
df_test4 = pd.read_stata('Test 4 data.dta')
print "Number of columns:", len(df_test4.columns)
print "Number of observations:", len(df_test4)
df_test4.head(1)
Number of columns: 39 Number of observations: 8238
Col1 | A2011_data | A2016_data | ein | fye | Form_990_FYE | SOX_policies | SOX_policies_all_binary | SOX_policies_binary | age | category | category_Animals | category_Arts__Culture__Humaniti | category_Community_Development | category_Education | category_Environment | category_Health | category_Human_Services | category_Human_and_Civil_Rights | category_International | category_Religion | category_Research_and_Public_Pol | complexity | complexity_2011 | conflict_of_interest_policy_v2 | donor_advisory | donor_advisory_2011_to_2016 | donor_advisory_2016 | org_id | program_efficiency | ratings_system | records_retention_policy_v2 | state | tot_rev | total_revenue_logged | whistleblower_policy_v2 | ncategory | revs | samused_ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50709 | 0 | 1 | 010202467 | FY2014 | 2014-12 | 3.00 | 1.00 | 1.00 | 62.00 | Research and Public Policy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.00 | nan | 1.00 | 0 | 0 | 0 | 5954 | 0.79 | CN 2.1 | 1.00 | ME | nan | 16.38 | 1.00 | Research and Public Policy | 12967968.00 | 1.00 |
print set(df.columns.tolist()) - set(df_test4.columns.tolist()), '\n'
print set(df_test4.columns.tolist()) - set(df.columns.tolist())
set(['Form 990 FYE', 'category_Human Services', 'category_Community Development', 'FYE', '2016_data', 'category_Arts, Culture, Humanities', 'category_Research and Public Policy', 'category_Human and Civil Rights', '2011_data', 'EIN']) set(['Form_990_FYE', 'samused_', 'category_Research_and_Public_Pol', 'category_Human_Services', 'A2011_data', 'fye', 'category_Human_and_Civil_Rights', 'Col1', 'category_Community_Development', 'A2016_data', 'revs', 'ein', 'category_Arts__Culture__Humaniti', 'ncategory'])
#print len(df.columns)
#print len(pd.merge(df, governance[governance_cols], left_on='EIN', right_on='EIN', how='left', indicator=True)), '\n'
#merged = pd.merge(df, governance[governance_cols], left_on='EIN', right_on='EIN', how='left', indicator=True)
#print len(merged.columns)
#print len(merged), '\n'
#print merged['_merge'].value_counts()
35 8304 56 8304 both 7133 left_only 1171 right_only 0 Name: _merge, dtype: int64
print len(df.columns)
print len(pd.merge(df_test4, governance[governance_cols], left_on='ein', right_on='EIN', how='left', indicator=True)), '\n'
merged = pd.merge(df_test4, governance[governance_cols], left_on='ein', right_on='EIN', how='left', indicator=True)
print len(merged.columns)
print len(merged), '\n'
print merged['_merge'].value_counts()
35 8238 66 8238 both 7133 left_only 1105 right_only 0 Name: _merge, dtype: int64
#merged = merged.drop('_merge', 1)
'''
print len(merged.columns)
print len(pd.merge(merged, governance2[['EIN', 'zero_FR']], left_on='ein', right_on='EIN', how='left', indicator=True)), '\n'
merged = pd.merge(merged, governance2[['EIN', 'zero_FR']], left_on='ein', right_on='EIN', how='left', indicator=True)
print len(merged.columns)
print len(merged), '\n'
print merged['_merge'].value_counts()
'''
65 8238 68 8238 both 7133 left_only 1105 right_only 0 Name: _merge, dtype: int64
merged[:1]
Col1 | A2011_data | A2016_data | ein | fye | Form_990_FYE | SOX_policies | SOX_policies_all_binary | SOX_policies_binary | age | category | category_Animals | category_Arts__Culture__Humaniti | category_Community_Development | category_Education | category_Environment | category_Health | category_Human_Services | category_Human_and_Civil_Rights | category_International | category_Religion | category_Research_and_Public_Pol | complexity | complexity_2011 | conflict_of_interest_policy_v2 | donor_advisory | donor_advisory_2011_to_2016 | donor_advisory_2016 | org_id | program_efficiency | ratings_system | records_retention_policy_v2 | state | tot_rev | total_revenue_logged | whistleblower_policy_v2 | ncategory | revs | samused_ | EIN_x | FYE | independent_directors_num | independent_directors_pct | independent_directors_pct_v2 | voting_directors | outsourced_mgt | 990_review | audit_committee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | restricted_donations | tax_exempt_bond | taxable_revenue | taxable_revenue_binary | taxable_revenue_neg_or_pos_rev_binary | audited_financials | independent_audited_fs | federal_grant_audit_performed | federal_grant_audit_required | FeesForServicesAccounting_binary | no_relations | elected_board | government_grant | EIN_y | zero_FR | _merge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50709 | 0 | 1 | 010202467 | FY2014 | 2014-12 | 3.00 | 1.00 | 1.00 | 62.00 | Research and Public Policy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.00 | nan | 1.00 | 0 | 0 | 0 | 5954 | 0.79 | CN 2.1 | 1.00 | ME | nan | 16.38 | 1.00 | Research and Public Policy | 12967968.00 | 1.00 | 010202467 | FY2014 | 21.00 | 0.95 | 0.95 | 22.00 | 1.00 | 1.00 | 1.00 | 4433997.00 | 7351259.00 | 23690097.00 | 0.50 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 1 | 1 | 1 | 1 | 1.00 | 0 | 1 | 1.00 | 010202467 | 0.00 | both |
#merged.rename(columns={'FYE_x':'FYE'}, inplace=True)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
merged[:1]
Col1 | A2011_data | A2016_data | ein | fye | Form_990_FYE | SOX_policies | SOX_policies_all_binary | SOX_policies_binary | age | category | category_Animals | category_Arts__Culture__Humaniti | category_Community_Development | category_Education | category_Environment | category_Health | category_Human_Services | category_Human_and_Civil_Rights | category_International | category_Religion | category_Research_and_Public_Pol | complexity | complexity_2011 | conflict_of_interest_policy_v2 | donor_advisory | donor_advisory_2011_to_2016 | donor_advisory_2016 | org_id | program_efficiency | ratings_system | records_retention_policy_v2 | state | tot_rev | total_revenue_logged | whistleblower_policy_v2 | ncategory | revs | EIN | FYE | independent_directors_num | independent_directors_pct | voting_directors | outsourced_mgt | 990_review | audit_committee | perm_rest_assets | temp_rest_assets | net_assets | donor_restrictions | tax_exempt_bond | taxable_revenue | taxable_revenue_binary | taxable_revenue_neg_or_pos_rev_binary | audited_financials | independent_audited_fs | federal_grant_audit_performed | federal_grant_audit_required | FeesForServicesAccounting_binary | _merge | independent_directors_pct_v2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50709 | 0 | 1 | 010202467 | FY2014 | 2014-12 | 3.00 | 1.00 | 1.00 | 62.00 | Research and Public Policy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.00 | nan | 1.00 | 0 | 0 | 0 | 5954 | 0.79 | CN 2.1 | 1.00 | ME | nan | 16.38 | 1.00 | Research and Public Policy | 12967968.00 | 010202467 | FY2014 | 21.00 | 0.95 | 22.00 | 1.00 | 1.00 | 1.00 | 4433997.00 | 7351259.00 | 23690097.00 | 0.50 | 1.00 | 0.00 | 0.00 | 0.00 | 1 | 1 | 1 | 1 | 1.00 | both | 0.95 |
merged.to_pickle('model 4 data with e-file governance variables (n=8,238).pkl')
merged.to_excel('model 4 data with e-file governance variables (n=8,238).xls')