Throughout this guided project, we'll practice working with different memory constraints.
In this step, let's assume we only have 10 megabytes of available memory.While crunchbase-investments.csv consumes 10.3 megabytes of disk space, we know from earlier missions that pandas often requires 4 to 6 times amount of space in memory as the file does on disk (especially when there's many string columns).
The data set of investments we'll be exploring is current as of October 2013. You can download it from GitHub. https://github.com/datahoarder/crunchbase-october-2013/blob/master/crunchbase-investments.csv
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
crunch_iter = pd.read_csv('crunchbase_investments.csv', encoding='ISO-8859-1', low_memory=False)
# dataframe
col_types = {
"company_name": "category", "company_category_code": "category", "company_country_code": "category",
"company_state_code": "category", "company_city": "category", "investor_name": "category",
"investor_category_code": "category", "investor_country_code": "category", "investor_state_code": "category",
"investor_city": "category", "funding_round_type": "category", "raised_amount_usd": "float64"
}
use_cols = [
"company_name", "company_category_code", "company_country_code", "company_state_code","company_city",
"investor_name", "investor_category_code", "investor_country_code", "investor_state_code", "investor_city",
"funding_round_type", "funded_at", "raised_amount_usd"
]
crunch_iter_shape = pd.read_csv('crunchbase_investments.csv', usecols=use_cols, dtype=col_types, parse_dates=["funded_at"], encoding='ISO-8859-1', chunksize=1000)
Steps:
# determine dtype for each column and memory usage in mb for each column
for crunch in crunch_iter:
print(crunch)
print(crunch_iter[crunch].dtype)
print((crunch_iter[crunch].memory_usage(deep=True)) / (2 ** 20))
print('------------------------')
company_permalink object 3.8699302673339844 ------------------------ company_name object 3.425077438354492 ------------------------ company_category_code object 3.2627410888671875 ------------------------ company_country_code object 3.0253448486328125 ------------------------ company_state_code object 2.962283134460449 ------------------------ company_region object 3.25362491607666 ------------------------ company_city object 3.343595504760742 ------------------------ investor_permalink object 4.749942779541016 ------------------------ investor_name object 3.7343921661376953 ------------------------ investor_category_code object 1.6893692016601562 ------------------------ investor_country_code object 2.704906463623047 ------------------------ investor_state_code object 2.542128562927246 ------------------------ investor_region object 3.239068031311035 ------------------------ investor_city object 2.9316816329956055 ------------------------ funding_round_type object 3.2528257369995117 ------------------------ funded_at object 3.3782129287719727 ------------------------ funded_month object 3.226959228515625 ------------------------ funded_quarter object 3.226959228515625 ------------------------ funded_year float64 0.4034881591796875 ------------------------ raised_amount_usd float64 0.4034881591796875 ------------------------
# count number of missing value counts
for crunch in crunch_iter_shape:
print(crunch)
print(crunch_iter_shape[crunch].isnull().sum())
print('------------------------')
company_name company_category_code company_country_code \ 0 AdverCar advertising USA 1 LaunchGram news USA 2 uTaP messaging USA 3 ZoopShop software USA 4 eFuneral web USA .. ... ... ... 995 SpotHero web USA 996 Cureeo ecommerce USA 997 Fibroblast enterprise USA 998 Whimseybox ecommerce USA 999 Moxie Jean ecommerce USA company_state_code company_city investor_name \ 0 CA San Francisco 1-800-FLOWERS.COM 1 CA Mountain View 10Xelerator 2 NaN NaN 10Xelerator 3 OH columbus 10Xelerator 4 OH Cleveland 10Xelerator .. ... ... ... 995 IL Chicago Excelerate Labs 996 IL Chicago Excelerate Labs 997 IL Chicago Excelerate Labs 998 TX Houston Excelerate Labs 999 IL Arlington Heights Excelerate Labs investor_category_code investor_country_code investor_state_code \ 0 NaN USA NY 1 finance USA OH 2 finance USA OH 3 finance USA OH 4 finance USA OH .. ... ... ... 995 consulting USA IL 996 consulting USA IL 997 consulting USA IL 998 consulting USA IL 999 consulting USA IL investor_city funding_round_type funded_at raised_amount_usd 0 New York series-a 2012-10-30 2000000.0 1 Columbus other 2012-01-23 20000.0 2 Columbus other 2012-01-01 20000.0 3 Columbus angel 2012-02-15 20000.0 4 Columbus other 2011-09-08 20000.0 .. ... ... ... ... 995 Chicago angel 2012-08-27 460000.0 996 Chicago angel 2012-06-01 75000.0 997 Chicago angel 2012-06-01 75000.0 998 Chicago angel 2012-06-01 75000.0 999 Chicago angel 2012-06-01 75000.0 [1000 rows x 13 columns]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-3-3672f6c6cbc3> in <module> 2 for crunch in crunch_iter_shape: 3 print(crunch) ----> 4 print(crunch_iter_shape[crunch].isnull().sum()) 5 print('------------------------') TypeError: 'TextFileReader' object is not subscriptable
# count unique values and unique value percentage of total values
overall_vc = []
for crunch in crunch_iter:
print(crunch)
num_unique_values = len(crunch_iter[crunch].unique())
num_total_values = len(crunch_iter[crunch])
print(num_unique_values / num_total_values)
print('------------------------')
overall_vc.append(crunch_iter[crunch].value_counts())
combined_vc = pd.concat(overall_vc)
print(combined_vc)
company_permalink 0.21891431813883108 ------------------------ company_name 0.21891431813883108 ------------------------ company_category_code 0.0008322299981085682 ------------------------ company_country_code 5.6742954416493286e-05 ------------------------ company_state_code 0.0009646302250803858 ------------------------ company_region 0.010346132021940608 ------------------------ company_city 0.023264611310762247 ------------------------ investor_permalink 0.19960279931908453 ------------------------ investor_name 0.19795725364100625 ------------------------ investor_category_code 0.0006430868167202572 ------------------------ investor_country_code 0.0013807452241346699 ------------------------ investor_state_code 0.0009646302250803858 ------------------------ investor_region 0.011083790429355022 ------------------------ investor_city 0.018744089275581615 ------------------------ funding_round_type 0.00018914318138831096 ------------------------ funded_at 0.053130319651976544 ------------------------ funded_month 0.003650463400794401 ------------------------ funded_quarter 0.0013807452241346699 ------------------------ funded_year 0.000397200680915453 ------------------------ raised_amount_usd 0.02759599016455457 ------------------------ /company/ecomom 58 /company/fab-com 57 /company/aperto-networks 49 /company/practice-fusion 47 /company/klout 46 .. 1204650.0 1 1644500.0 1 34995.0 1 1512066.0 1 83000000.0 1 Length: 52322, dtype: int64
tables to drop
Tables to include and their data type to specify:
# find max integer/float in column
print(crunch_iter['raised_amount_usd'].max())
3200000000.0
# mapping the memory usage of each chunk
memory_footprints = []
for chunk in crunch_iter_shape:
memory_footprints.append(chunk.memory_usage(deep=True).sum()/(2 ** 20))
plt.hist(memory_footprints)
(array([ 6., 20., 13., 11., 0., 0., 0., 1., 0., 1.]), array([0.12555218, 0.13660088, 0.14764957, 0.15869827, 0.16974697, 0.18079567, 0.19184437, 0.20289307, 0.21394176, 0.22499046, 0.23603916]), <BarContainer object of 10 artists>)
# totaling the chunks together to determine total memory used in dataframe
print(sum(memory_footprints))
7.870912551879883
# create and connecto a new sqlite database file
conn = sqlite3.connect('crunchbase_investments.db')
# export each chunk to a new table in the sqlite db
for chunk in crunch_iter_shape:
chunk.to_sql('investments', conn, if_exists='append', index=False)
# query the table and make sure the data types match up for each column
pd.read_sql('''SELECT * FROM investments;''', conn)
company_name | company_category_code | company_country_code | company_state_code | company_city | investor_name | investor_category_code | investor_country_code | investor_state_code | investor_city | funding_round_type | funded_at | raised_amount_usd | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AdverCar | advertising | USA | CA | San Francisco | 1-800-FLOWERS.COM | None | USA | NY | New York | series-a | 2012-10-30 00:00:00 | 2000000.0 |
1 | LaunchGram | news | USA | CA | Mountain View | 10Xelerator | finance | USA | OH | Columbus | other | 2012-01-23 00:00:00 | 20000.0 |
2 | uTaP | messaging | USA | None | None | 10Xelerator | finance | USA | OH | Columbus | other | 2012-01-01 00:00:00 | 20000.0 |
3 | ZoopShop | software | USA | OH | columbus | 10Xelerator | finance | USA | OH | Columbus | angel | 2012-02-15 00:00:00 | 20000.0 |
4 | eFuneral | web | USA | OH | Cleveland | 10Xelerator | finance | USA | OH | Columbus | other | 2011-09-08 00:00:00 | 20000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
52865 | Garantia Data | enterprise | USA | CA | Santa Clara | Zohar Gilon | None | None | None | None | series-a | 2012-08-08 00:00:00 | 3800000.0 |
52866 | DudaMobile | mobile | USA | CA | Palo Alto | Zohar Gilon | None | None | None | None | series-c+ | 2013-04-08 00:00:00 | 10300000.0 |
52867 | SiteBrains | software | USA | CA | San Francisco | zohar israel | None | None | None | None | angel | 2010-08-01 00:00:00 | 350000.0 |
52868 | Comprehend Systems | enterprise | USA | CA | Palo Alto | Zorba Lieberman | None | None | None | None | series-a | 2013-07-11 00:00:00 | 8400000.0 |
52869 | SmartThings | mobile | USA | DC | Minneapolis | Zorik Gordon | None | None | None | None | series-a | 2012-12-04 00:00:00 | 3000000.0 |
52870 rows × 13 columns
results_df = pd.read_sql('''PRAGMA table_info(investments);''', conn)
print(results_df)
cid name type notnull dflt_value pk 0 0 company_name TEXT 0 None 0 1 1 company_category_code TEXT 0 None 0 2 2 company_country_code TEXT 0 None 0 3 3 company_state_code TEXT 0 None 0 4 4 company_city TEXT 0 None 0 5 5 investor_name TEXT 0 None 0 6 6 investor_category_code TEXT 0 None 0 7 7 investor_country_code TEXT 0 None 0 8 8 investor_state_code TEXT 0 None 0 9 9 investor_city TEXT 0 None 0 10 10 funding_round_type TEXT 0 None 0 11 11 funded_at TIMESTAMP 0 None 0 12 12 raised_amount_usd REAL 0 None 0
# What proportion of the total amount of funds did the top 10% raise?
pd.read_sql('''
SELECT investor_name,
SUM(raised_amount_usd) AS total_investment,
(SUM(raised_amount_usd) / 681732200000) AS '%_of_total'
FROM investments
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
''', conn)
investor_name | total_investment | %_of_total | |
---|---|---|---|
0 | Kleiner Perkins Caufield & Byers | 1.121783e+10 | 0.016455 |
1 | New Enterprise Associates | 9.692542e+09 | 0.014218 |
2 | Accel Partners | 6.472126e+09 | 0.009494 |
3 | Goldman Sachs | 6.375459e+09 | 0.009352 |
4 | Sequoia Capital | 6.039402e+09 | 0.008859 |
5 | Intel | 5.969200e+09 | 0.008756 |
6 | 5.808800e+09 | 0.008521 | |
7 | Time Warner | 5.730000e+09 | 0.008405 |
8 | Comcast | 5.669000e+09 | 0.008316 |
9 | Greylock Partners | 4.960983e+09 | 0.007277 |
# top ten percent of investors
pd.read_sql('''
WITH top_ten_investors AS
(
SELECT investor_name,
SUM(raised_amount_usd) AS total_investment
FROM investments
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1046
)
SELECT SUM(tti.total_investment)
FROM top_ten_investors AS tti;
''', conn)
# total amount of top ten percent of investors = $565,443,000,000
SUM(tti.total_investment) | |
---|---|
0 | 5.654430e+11 |
pd.read_sql('''
SELECT SUM(raised_amount_usd)
FROM investments
''', conn)
# total amount raised from all investors = $681,732,200,000
SUM(raised_amount_usd) | |
---|---|
0 | 6.817322e+11 |
# top one percent of investors
pd.read_sql('''
WITH top_one_investors AS
(
SELECT investor_name,
SUM(raised_amount_usd) AS total_investment
FROM investments
GROUP BY 1
ORDER BY 2 DESC
LIMIT 105
)
SELECT SUM(tti.total_investment)
FROM top_one_investors AS tti;
''', conn)
# total amount of top one percent of investors = $276,969,900,000
SUM(tti.total_investment) | |
---|---|
0 | 2.769699e+11 |
# bottom ten percent of investors
pd.read_sql('''
WITH bottom_ten_investors AS
(
SELECT investor_name,
SUM(raised_amount_usd) AS total_investment
FROM investments
GROUP BY 1
ORDER BY 2 ASC
LIMIT 1046
)
SELECT SUM(tti.total_investment)
FROM bottom_ten_investors AS tti;
''', conn)
# total amount of bottom ten percent of investors = $21,409,138
SUM(tti.total_investment) | |
---|---|
0 | 21409138.0 |
# bottom one percent of investors
pd.read_sql('''
WITH bottom_one_investors AS
(
SELECT investor_name,
SUM(raised_amount_usd) AS total_investment
FROM investments
GROUP BY 1
ORDER BY 2 ASC
LIMIT 105
)
SELECT SUM(tti.total_investment)
FROM bottom_one_investors AS tti;
''', conn)
# total amount of bottom one percent of investors = $0.00
SUM(tti.total_investment) | |
---|---|
0 | None |
# Which category of company attracted the most investments?
pd.read_sql('''
SELECT company_category_code,
COUNT(investor_name) AS number_of_investors,
SUM(raised_amount_usd) AS total_investment
FROM investments
GROUP BY 1
ORDER BY 3 DESC
LIMIT 1
''', conn)
company_category_code | number_of_investors | total_investment | |
---|---|---|---|
0 | biotech | 4951 | 1.103964e+11 |
# Which investor contributed the most money (across all startups)?
pd.read_sql('''
SELECT investor_name,
SUM(raised_amount_usd) As total_investment
FROM investments
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
''', conn)
investor_name | total_investment | |
---|---|---|
0 | Kleiner Perkins Caufield & Byers | 1.121783e+10 |
# Which investors contributed the most money per startup?
pd.read_sql('''
SELECT investor_name,
(raised_amount_usd / company_name) AS money_per_startup
FROM investments
GROUP BY 1
ORDER BY 2 DESC
LIMIT 5;
''', conn)
investor_name | money_per_startup | |
---|---|---|
0 | Hillman Ventures | 1.300000e+07 |
1 | City Light Capital | 5.000000e+06 |
2 | KLP Enterprises | 5.000000e+06 |
3 | Nestors Financial | 3.333333e+06 |
4 | Dan Ehrman | 2.500000e+06 |
# Which funding round was the most popular?
pd.read_sql('''
SELECT funding_round_type,
COUNT(investor_name)
FROM investments
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
''', conn)
funding_round_type | COUNT(investor_name) | |
---|---|---|
0 | series-a | 13938 |
# Which funding round was the most popular?
pd.read_sql('''
SELECT company_name,
funded_at,
COUNT(investor_name),
raised_amount_usd
FROM investments
WHERE raised_amount_usd > 0
GROUP BY 2, 1, 4
ORDER BY 3 DESC
LIMIT 1;
''', conn)
company_name | funded_at | COUNT(investor_name) | raised_amount_usd | |
---|---|---|---|---|
0 | Priceonomics | 2012-05-04 00:00:00 | 29 | 1500000.0 |
# Which funding round was the least popular?
pd.read_sql('''
SELECT funding_round_type,
COUNT(investor_name)
FROM investments
WHERE funding_round_type != 'None'
GROUP BY 1
ORDER BY 2 ASC
LIMIT 1;
''', conn)
funding_round_type | COUNT(investor_name) | |
---|---|---|
0 | crowdfunding | 5 |
# Which funding round was the most popular?
pd.read_sql('''
SELECT company_name,
funded_at,
COUNT(investor_name),
raised_amount_usd
FROM investments
WHERE funded_at != 'None'
GROUP BY 2, 1, 4
ORDER BY 3 ASC
LIMIT 1;
''', conn)
company_name | funded_at | COUNT(investor_name) | raised_amount_usd | |
---|---|---|---|---|
0 | Cisco | 1987-01-01 00:00:00 | 1 | 2500000.0 |
# query the table and make sure the data types match up for each column
pd.read_sql('''
SELECT *
FROM investments
WHERE company_name = 'Socialcam';
''', conn)
company_name | company_category_code | company_country_code | company_state_code | company_city | investor_name | investor_category_code | investor_country_code | investor_state_code | investor_city | funding_round_type | funded_at | raised_amount_usd | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Socialcam | mobile | USA | CA | Santa Clara County | Atom Factory | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
1 | Socialcam | mobile | USA | CA | Santa Clara County | A-Grade Investments | None | USA | CA | Los Angeles | angel | 2012-04-30 00:00:00 | None |
2 | Socialcam | mobile | USA | CA | Santa Clara County | CrunchFund | None | USA | CA | San Francisco | angel | 2012-04-30 00:00:00 | None |
3 | Socialcam | mobile | USA | CA | Santa Clara County | Start Fund | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
4 | Socialcam | mobile | USA | CA | Santa Clara County | SV Angel | None | USA | CA | San Francisco | angel | 2012-04-30 00:00:00 | None |
5 | Socialcam | mobile | USA | CA | Santa Clara County | Alexis Ohanian | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
6 | Socialcam | mobile | USA | CA | Santa Clara County | Ari Emmanuel | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
7 | Socialcam | mobile | USA | CA | Santa Clara County | Ashton Kutcher | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
8 | Socialcam | mobile | USA | CA | Santa Clara County | Brian Chesky | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
9 | Socialcam | mobile | USA | CA | Santa Clara County | Emmett Shear | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
10 | Socialcam | mobile | USA | CA | Santa Clara County | Erik Moore | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
11 | Socialcam | mobile | USA | CA | Santa Clara County | Garry Tan | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
12 | Socialcam | mobile | USA | CA | Santa Clara County | Harjeet Taggar | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
13 | Socialcam | mobile | USA | CA | Santa Clara County | Jason Johnson | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
14 | Socialcam | mobile | USA | CA | Santa Clara County | Jeff Kapel | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
15 | Socialcam | mobile | USA | CA | Santa Clara County | Jessica Livingston | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
16 | Socialcam | mobile | USA | CA | Santa Clara County | Jonathan Abrams | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
17 | Socialcam | mobile | USA | CA | Santa Clara County | Justin Caldbeck | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
18 | Socialcam | mobile | USA | CA | Santa Clara County | Justin Kan | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
19 | Socialcam | mobile | USA | CA | Santa Clara County | Kyle Vogt | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
20 | Socialcam | mobile | USA | CA | Santa Clara County | Laurene Powell Jobs | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
21 | Socialcam | mobile | USA | CA | Santa Clara County | Matt Ocko | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
22 | Socialcam | mobile | USA | CA | Santa Clara County | Michael Abrams | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
23 | Socialcam | mobile | USA | CA | Santa Clara County | Michael Levit | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
24 | Socialcam | mobile | USA | CA | Santa Clara County | Michael Rapino | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
25 | Socialcam | mobile | USA | CA | Santa Clara County | Paul Buchheit | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
26 | Socialcam | mobile | USA | CA | Santa Clara County | Paul Graham | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
27 | Socialcam | mobile | USA | CA | Santa Clara County | Ram Shiram | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
28 | Socialcam | mobile | USA | CA | Santa Clara County | Ronny Conway | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
29 | Socialcam | mobile | USA | CA | Santa Clara County | Shane Battier | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
30 | Socialcam | mobile | USA | CA | Santa Clara County | Shervin Pishevar | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
31 | Socialcam | mobile | USA | CA | Santa Clara County | Stewart Alsop | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
32 | Socialcam | mobile | USA | CA | Santa Clara County | Timothy Draper | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
33 | Socialcam | mobile | USA | CA | Santa Clara County | Trajan Langdon | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
34 | Socialcam | mobile | USA | CA | Santa Clara County | Yuri Milner | None | None | None | None | angel | 2012-04-30 00:00:00 | None |
35 | Socialcam | mobile | USA | CA | Santa Clara County | Zachary Bogue | None | None | None | None | angel | 2012-04-30 00:00:00 | None |