import sqlite3
import pandas as pd
import numpy as np
size=[]
chunk_iter= pd.read_csv("crunchbase-investments.csv",encoding='latin',chunksize=5000)
for chunk in chunk_iter:
size.append(chunk.memory_usage(deep=True).sum()/(1024**2))
print("avg size of each block",sum(size)/(len(size)))
avg size of each block 5.180771307511763
#Each column's missing value counts
missing_values=[]
data_len=0
columns_memory=pd.Series()
counter=0
chunk_iter= pd.read_csv("crunchbase-investments.csv",encoding='latin',chunksize=5000)
for chunk in chunk_iter:
missing_values.append(chunk.apply(pd.isnull).sum())
data_len+=chunk.shape[0]
if counter==0:
columns_memory=chunk.memory_usage(deep=True)/(1024**2)
else:
columns_memory+=chunk.memory_usage(deep=True)/(1024**2)
counter=1
combined_missing_value=pd.concat(missing_values)
print("No of rows in data:",data_len)
print("size of data :","{:.2f} mb".format(sum(size)))
print(combined_missing_value.groupby(combined_missing_value.index).sum().sort_values())
No of rows in data: 52870 size of data : 56.99 mb company_country_code 1 company_name 1 company_permalink 1 company_region 1 investor_region 2 investor_permalink 2 investor_name 2 funded_quarter 3 funded_at 3 funded_month 3 funded_year 3 funding_round_type 3 company_state_code 492 company_city 533 company_category_code 643 raised_amount_usd 3599 investor_country_code 12001 investor_city 12480 investor_state_code 16809 investor_category_code 50427 dtype: int64
columns_memory
Index 0.000877 company_permalink 3.869808 company_name 3.424955 company_category_code 3.262619 company_country_code 3.025223 company_state_code 2.962161 company_region 3.253541 company_city 3.343512 investor_permalink 4.749821 investor_name 3.734270 investor_category_code 0.593590 investor_country_code 2.524654 investor_state_code 2.361876 investor_region 3.238946 investor_city 2.751430 funding_round_type 3.252704 funded_at 3.378091 funded_month 3.226837 funded_quarter 3.226837 funded_year 0.403366 raised_amount_usd 0.403366 dtype: float64
# Drop columns representing URL's or containing way too many missing values (>90% missing)
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
keep_cols = chunk.columns.drop(drop_cols)
#identify type of each column
datatype={}
chunk_iter= pd.read_csv("crunchbase-investments.csv",encoding='latin',chunksize=5000)
for chunk in chunk_iter:
for cols in chunk.columns:
datatype[cols]=chunk[cols].dtype
# Drop columns representing URL's or containing way too many missing values (>90% missing)
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
keep_cols = chunk.columns.drop(drop_cols)
#identify type of each column
size=[]
conn = sqlite3.connect('crunchbase.db')
cur=conn.cursor()
columns_category={'company_category_code':'category',
'company_city':'category',
'company_country_code':'category',
'company_name':'category',
'company_permalink':'category',
'company_region':'category',
'company_state_code':'category',
'funding_round_type': 'category',
'investor_category_code': 'category',
'investor_city': 'category',
'investor_country_code':'category',
'investor_name':'category',
'investor_permalink':'category',
'investor_region':'category',
'investor_state_code':'category',
}
datatype={}
chunk_iter= pd.read_csv("crunchbase-investments.csv",
encoding='latin',chunksize=5000,
dtype=columns_category,
parse_dates=["funded_at"])
for chunk in chunk_iter:
chunk=chunk.dropna()
funded_month=chunk["funded_month"].str.split("-").str[-1]
funded_quarter=chunk["funded_quarter"].str.split("-").str[-1]
chunk["funded_month"]=pd.to_numeric(funded_month,downcast='signed')
chunk["funded_year"]=chunk["funded_year"].astype('int32')
chunk['funded_quarter']=funded_quarter
chunk["funded_quarter"]=chunk["funded_quarter"].astype("category")
size.append(chunk.memory_usage(deep=True).sum()/(1024**2))
chunk.to_sql('investments',conn,if_exists='append',index=False)
print("size of data {:.2f} mb".format(sum(size)))
size of data 9.63 mb
results_df = pd.read_sql('PRAGMA table_info(investments);', conn)
print(results_df)
cid name type notnull dflt_value pk 0 0 company_permalink TEXT 0 None 0 1 1 company_name TEXT 0 None 0 2 2 company_category_code TEXT 0 None 0 3 3 company_country_code TEXT 0 None 0 4 4 company_state_code TEXT 0 None 0 5 5 company_region TEXT 0 None 0 6 6 company_city TEXT 0 None 0 7 7 investor_permalink TEXT 0 None 0 8 8 investor_name TEXT 0 None 0 9 9 investor_category_code TEXT 0 None 0 10 10 investor_country_code TEXT 0 None 0 11 11 investor_state_code TEXT 0 None 0 12 12 investor_region TEXT 0 None 0 13 13 investor_city TEXT 0 None 0 14 14 funding_round_type TEXT 0 None 0 15 15 funded_at TEXT 0 None 0 16 16 funded_month TEXT 0 None 0 17 17 funded_quarter TEXT 0 None 0 18 18 funded_year INTEGER 0 None 0 19 19 raised_amount_usd REAL 0 None 0
#top 10% fund rasiser analysis
query="""
select iv.company_name,
cast(sum(raised_amount_usd) as double)/(select cast(sum(raised_amount_usd) as bigint) from investments) as percentage_funding,
cast(sum(raised_amount_usd) as bigint) as funding_amount
from investments as iv
group by iv.company_name
order by funding_amount desc
limit (select cast(count(distinct company_name)*.1 as int) from investments)
"""
top_10_raised=pd.read_sql(query,conn)
print("funding raised by top 10 percent %.2f billion dollars"%(top_10_raised["funding_amount"].sum()/10000000000))
funding raised by top 10 percent 58.35 billion dollars
#top 1% fund rasiser analysis
query="""
select iv.company_name,
cast(sum(raised_amount_usd) as double)/(select cast(sum(raised_amount_usd) as bigint) from investments) as percentage_funding,
cast(sum(raised_amount_usd) as bigint) as funding_amount
from investments as iv
group by iv.company_name
order by funding_amount desc
limit (select cast(count(distinct company_name)*.01 as int) from investments)
"""
top_1_raised=pd.read_sql(query,conn)
print("funding raised by top 1 percent %.2f billion dollars"%(top_1_raised["funding_amount"].sum()/10000000000))
funding raised by top 1 percent 27.38 billion dollars
#bottom 10% fund rasiser analysis
query="""
select iv.company_name,
round(cast(sum(raised_amount_usd) as double)/(select cast(sum(raised_amount_usd) as double) from investments),6) as percentage_funding,
cast(sum(raised_amount_usd) as bigint) as funding_amount
from investments as iv
group by iv.company_name
having funding_amount is not Null
order by funding_amount asc
limit (select cast(count(distinct company_name)*.1 as int) from investments)
"""
btm_10_raised=pd.read_sql(query,conn)
print("funding raised by btm 10 percent %.10f billion dollars"%(btm_10_raised["funding_amount"].sum()/10000000000))
funding raised by btm 10 percent 0.0294283333 billion dollars
#btm 1% percent funding
query="""
select iv.company_name,
round(cast(sum(raised_amount_usd) as double)/(select cast(sum(raised_amount_usd) as double) from investments),6) as percentage_funding,
cast(sum(raised_amount_usd) as bigint) as funding_amount
from investments as iv
group by iv.company_name
having funding_amount is not Null
order by funding_amount asc
limit (select cast(count(distinct company_name)*.01 as int) from investments)
"""
btm_1_raised=pd.read_sql(query,conn)
print("funding raised by btm 1 percent %.10f billion dollars"%(btm_1_raised["funding_amount"].sum()/10000000000))
funding raised by btm 1 percent 0.0002034700 billion dollars
#category of company attracted most of investors
query="""
select iv.company_category_code,count(*) as frequency
from investments as iv
group by iv.company_category_code
order by frequency desc
limit 1
"""
investment=pd.read_sql(query,conn)
print("category: %s , frequency_investment: %d"%(investment["company_category_code"][0],investment["frequency"][0]))
category: software , frequency_investment: 7855
#category of investor contributed the money
query="""
select iv.investor_name,count(*) as frequency
from investments as iv
group by iv.investor_name
having investor_name is not Null
order by frequency desc
limit 1
"""
investor_name=pd.read_sql(query,conn)
investor_name
investor_name | frequency | |
---|---|---|
0 | Techstars | 681 |
print("investor: %s , frequency_investment: %d"%(investor_name["investor_name"][0],investor_name["frequency"][0]))
investor: Techstars , frequency_investment: 681
#category of investor contributed the most money per startup
query="""
select iv.investor_name,count(*) as frequency,
sum(raised_amount_usd) as investment
from investments as iv
group by iv.investor_name
order by investment desc
limit 1
"""
investor_money=pd.read_sql(query,conn)
print("investor: %s , investment in billons: %d"%(investor_money["investor_name"][0],investor_money["investment"][0]/10000000000))
investor: Intel , investment in billons: 2
query="""
select iv.funding_round_type as f_r_t,
count(*) as frequency
from investments as iv
group by f_r_t
order by frequency desc
limit 1
"""
funding_pop=pd.read_sql(query,conn)
print("funding popular: %s"%(funding_pop["f_r_t"][0]))
funding popular: series-a
query="""
select iv.funding_round_type as f_r_t,
count(*) as frequency
from investments as iv
group by f_r_t
having f_r_t is not Null
order by frequency asc
limit 1
"""
funding_pop=pd.read_sql(query,conn)
print("funding least popular: %s"%(funding_pop["f_r_t"][0]))
funding least popular: crowdfunding