#!/usr/bin/env python
# coding: utf-8

# # Analyzing Startup Fundraising Deals from Crunchbase
# 
# We will analyze startup investments from [Crunchbase].
# 
# Every year, thousands of startup companies raise financing from investors. Each time a startup raises money, we refer to the event as a fundraising round. Crunchbase is a website that crowdsources information on the fundraising rounds of many startups. The Crunchbase user community submits, edits, and maintains most of the information on the site.
# 
# In return, Crunchbase makes the data available through a web application and a fee-based API. Before Crunchbase switched to the paid API model, multiple groups crawled the site and released the data online. Because the information on startups and their fundraising rounds is always changing, the data set we'll be using isn't completely up to date.
# 
# The data set of investments we'll be exploring is current as of October 2013. You can download it from [GitHub].
# 
# Throughout this guided project, we'll practice working with different memory constraints. In this step, let's assume we only have 10 megabytes of available memory. While `crunchbase-investments.csv` consumes 10.3 megabytes of disk space, we know from earlier missions that pandas often requires 4 to 6 times as much space in memory as the file does on disk (especially when there are many string columns). A quick check of this claim follows the imports below.
# 
# [Crunchbase]: https://www.crunchbase.com/
# [GitHub]: https://github.com/datahoarder/crunchbase-october-2013/blob/master/crunchbase-investments.csv

# In[1]:


import sqlite3
import pandas as pd
import pprint as pp
import matplotlib.pyplot as plt

pd.options.display.max_columns = 99


# In[2]:


get_ipython().system('ls')
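# As a rough sanity check of the disk-vs-memory claim above, here is a minimal sketch
# (not part of the original project) that compares the file's size on disk with the
# in-memory footprint of a single 5,000-row sample:

# In[ ]:


import os

# Size of the CSV on disk, in megabytes
disk_mb = os.path.getsize('crunchbase-investments.csv') / 1048576

# In-memory footprint of a 5,000-row sample. The full file has over 50,000 rows,
# i.e. ten-plus samples of this size, so the full in-memory total is several times larger.
sample = pd.read_csv('crunchbase-investments.csv', nrows=5000, encoding='ISO-8859-1')
sample_mb = sample.memory_usage(deep=True).sum() / 1048576

print(f'On disk: {disk_mb:.1f} MB | 5,000-row sample in memory: {sample_mb:.1f} MB')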
# ## Introduction
# 
# - Because the data set contains over 50,000 rows, you'll need to read the data set into dataframes using 5,000-row chunks to ensure that each chunk consumes much less than 10 megabytes of memory.
# - Across all of the chunks, become familiar with:
#     - Each column's missing value counts
#     - Each column's memory footprint
#     - The total memory footprint of all of the chunks combined
#     - Which column(s) we can drop because they aren't useful for analysis

# ### Check Each Chunk's Memory

# In[3]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1')
memory_footprints = []
i = 0
for chunk in chunks_it:
    i += 1
    chunk_memory_mb = chunk.memory_usage(deep=True).sum() / 1048576
    memory_footprints.append(chunk_memory_mb)
    print("Chunk {0} memory: {1:0.2f} mb".format(i, chunk_memory_mb))

plt.hist(memory_footprints)
plt.show()


# **Observations**:
# 
# - Each chunk is under the 10 MB memory limit we set.
# - Each chunk consumes roughly 5.5 MB, except for the last two chunks.

# ### Check each column's missing value counts across all chunks

# In[4]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1')
missing_vc_list = []
for chunk in chunks_it:
    # count the missing values per column in this chunk
    missing_vc_list.append(chunk.isnull().sum())

combined_missing_vc = pd.concat(missing_vc_list)
unique_combined_missing_vc = combined_missing_vc.groupby(combined_missing_vc.index).sum()
unique_combined_missing_vc.sort_values()


# **Observations**:
# 
# - A lot of investor-related information is missing across the data set, specifically `investor_state_code`, `investor_category_code`, `investor_city`, and `investor_country_code`.

# ### Check each column's memory footprint

# In[5]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1')
series_memory_fp = pd.Series(dtype='float64')
counter = 0
for chunk in chunks_it:
    if counter == 0:
        series_memory_fp = chunk.memory_usage(deep=True)
    else:
        series_memory_fp += chunk.memory_usage(deep=True)
    counter += 1

# drop the index entry -- we only care about the columns themselves
series_memory_fp = series_memory_fp.drop('Index')
series_memory_fp


# ### The total memory footprint of all of the chunks combined

# In[6]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1')
memory_footprints = []
i = 0
for chunk in chunks_it:
    i += 1
    chunk_memory_mb = chunk.memory_usage(deep=True).sum() / 1048576
    memory_footprints.append(chunk_memory_mb)
    print("Chunk {0} memory: {1:0.2f} mb".format(i, chunk_memory_mb))

print("Total memory: {0:0.2f} mb".format(sum(memory_footprints)))


# In[7]:


series_memory_fp.sum() / 1048576


# ### Which column(s) we can drop because they aren't useful for analysis
# 
# Let's drop columns that aren't very helpful for analysis (URLs) or that have too many missing values (> 90%).

# In[8]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1')
total_num_rows = 0
for chunk in chunks_it:
    total_num_rows += chunk.shape[0]
print("Total Rows: {0}".format(total_num_rows))


# In[9]:


# percentage of missing values per column
unique_combined_missing_vc.sort_values() / total_num_rows * 100


# In[12]:


# Drop columns representing URLs or containing too many missing values (> 90%)
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
# `chunk` still holds the last chunk from the loop above, so its columns give us the full column list
keep_cols = chunk.columns.drop(drop_cols)

print('Columns to Keep:')
print(keep_cols.tolist())
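# The same selection could also be made programmatically. A minimal sketch (assuming
# `unique_combined_missing_vc` and `total_num_rows` from the cells above) that flags any
# column whose missing-value share exceeds 90%, plus the two URL columns:

# In[ ]:


missing_pct = unique_combined_missing_vc / total_num_rows * 100

url_cols = ['investor_permalink', 'company_permalink']
sparse_cols = missing_pct[missing_pct > 90].index.tolist()

programmatic_drop = sorted(set(url_cols + sparse_cols))
print('Columns a 90% threshold would drop:', programmatic_drop)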
# ---
# ## Selecting Data Types
# 
# Let's get familiar with the column types before adding the data into SQLite.
# 
# - Identify the types for each column.
# - Identify the numeric columns we can represent using more space-efficient types.
# - For text columns:
#     - Analyze the unique value counts across all of the chunks to see if we can convert them to a numeric type.
#     - See if we can clean any text columns and separate them into multiple numeric columns without adding any overhead when querying.
# - Make your changes to the code from the last step so that the overall memory the data consumes stays under 10 megabytes.

# ### Types for each column

# In[13]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
col_types = {}
for chunk in chunks_it:
    for col in chunk.columns:
        if col not in col_types:
            # use a set so we only keep the unique dtypes seen for each column
            col_types[col] = set([str(chunk.dtypes[col])])
        else:
            col_types[col].add(str(chunk.dtypes[col]))

pp.pprint(col_types)


# **Observations**:
# 
# - A few columns show more than one type across chunks: `funded_year`, `investor_city`, `investor_country_code`, `investor_state_code`.
# - There are a number of **object** columns that could be converted to more appropriate, space-efficient types.

# ### Identify the numeric columns we can represent using more space efficient types

# In[14]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
count = 0
for chunk in chunks_it:
    count += 1
    print(f'\nChunk {count:<3}:', end=' ')
    float_cols = chunk.select_dtypes(include=['float', 'int']).columns
    print(float_cols)
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'],
                                               downcast='float')


# **Observations**:
# 
# - `raised_amount_usd` is the only float column that appears consistently across all the chunks. We could use a more space-efficient type for it.
# 
# Let's dig a little deeper and look at the unique values within the numeric columns.

# In[15]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
count = 0
str_cols_vc = {}
for chunk in chunks_it:
    count += 1
    print(f'\nCHUNK {count}:')
    num_cols = chunk.select_dtypes(include=['float', 'int'])
    for col in num_cols:
        print(f'\n{col}')
        current_col_vc = num_cols[col].value_counts()
        if col in str_cols_vc:
            str_cols_vc[col].append(current_col_vc)
        else:
            str_cols_vc[col] = [current_col_vc]
        unique_values = chunk[col].unique()
        print(f'type: {chunk[col].dtype} | {len(unique_values)} values: {unique_values[:5]}')

# combine the value count results across chunks
combined_vcs = {}
for col, vc_list in str_cols_vc.items():
    combined_vc = pd.concat(vc_list)
    final_vc = combined_vc.groupby(combined_vc.index).sum()
    combined_vcs[col] = final_vc


# **Observations**:
# 
# - The `funded_year` column could be converted from float to integer, since it only holds the funding year.
# - There are only 2 genuinely numeric columns, `funded_year` and `raised_amount_usd`. The remaining columns show up here only because they are entirely NaN (and therefore float) in some of the later chunks.
# - The `raised_amount_usd` column can be downcast to a more space-efficient float type.
# - Addition: the `funded_at` column already carries the year, month, and day information, so `funded_year` is no longer required (the short check below compares the two columns).
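# A minimal sketch (not part of the original project) that checks, chunk by chunk, how
# often `funded_year` disagrees with the year embedded in `funded_at`. It assumes the
# columns compare cleanly as numbers, which is worth keeping in mind given the mixed
# dtypes noted above:

# In[ ]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
mismatches = 0
for chunk in chunks_it:
    funded_dates = pd.to_datetime(chunk['funded_at'], errors='coerce')
    # only compare rows where both values are present
    mask = funded_dates.notnull() & chunk['funded_year'].notnull()
    mismatches += (funded_dates[mask].dt.year != chunk['funded_year'][mask]).sum()

print(f'Rows where funded_year disagrees with the year in funded_at: {mismatches}')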
# ### Text Columns
# 
# - Analyze the unique value counts across all of the chunks to see if we can convert them to a numeric type.
# - See if we can clean any text columns and separate them into multiple numeric columns without adding any overhead when querying.

# In[16]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
count = 0
str_cols_vc = {}
for chunk in chunks_it:
    count += 1
    print(f'\nCHUNK {count}:')
    obj_cols = chunk.select_dtypes(include=['object'])
    for col in obj_cols:
        current_col_vc = obj_cols[col].value_counts()
        if col in str_cols_vc:
            str_cols_vc[col].append(current_col_vc)
        else:
            str_cols_vc[col] = [current_col_vc]
        unique_values = chunk[col].unique()
        print(f'col: {col} | type: {chunk[col].dtype} |'
              f' {len(unique_values)} values: {unique_values[:5]}')

# combine the value count results across chunks
combined_vcs = {}
for col, vc_list in str_cols_vc.items():
    combined_vc = pd.concat(vc_list)
    final_vc = combined_vc.groupby(combined_vc.index).sum()
    combined_vcs[col] = final_vc


# In[17]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
str_cols_vc = {}
for chunk in chunks_it:
    for col in chunk:
        current_col_vc = chunk[col].value_counts()
        if col in str_cols_vc:
            str_cols_vc[col].append(current_col_vc)
        else:
            str_cols_vc[col] = [current_col_vc]

# combine the value count results across chunks
combined_vcs = {}
for col, vc_list in str_cols_vc.items():
    combined_vc = pd.concat(vc_list)
    final_vc = combined_vc.groupby(combined_vc.index).sum()
    combined_vcs[col] = final_vc

pp.pprint(combined_vcs.keys())
print('\nAll Value Counts:')
pp.pprint(combined_vcs)


# **Observations**:
# 
# - The `funded_month`, `funded_quarter`, and `funded_year` columns are unnecessary: we already have a `funded_at` column holding the exact funding date, and all three can easily be derived from it (see the sketch below).
# - The `funded_at` column can be converted into a datetime column.
# - There are more opportunities to clean the data further; a deeper dive into the remaining columns would be a good extension of this project.
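# A minimal sketch (not part of the original workflow) showing how the year, month, and
# quarter can be recovered from `funded_at` with pandas' `.dt` accessor, using a small
# sample of rows:

# In[ ]:


sample = pd.read_csv('crunchbase-investments.csv', nrows=5000,
                     encoding='ISO-8859-1', usecols=keep_cols)
funded_at = pd.to_datetime(sample['funded_at'], errors='coerce')

derived = pd.DataFrame({
    'funded_at': funded_at,
    'year': funded_at.dt.year,
    'month': funded_at.dt.month,
    'quarter': funded_at.dt.quarter,
})
derived.head()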
# In[30]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
total_num_rows = 0
for chunk in chunks_it:
    total_num_rows += chunk.shape[0]
print("Total Rows: {0}".format(total_num_rows))


# In[20]:


# On top of the URL and >90%-missing columns we already dropped, drop the three
# redundant funding-date columns we identified (funded_at covers them all)
drop_cols = ['funded_month', 'funded_quarter', 'funded_year']
keep_cols = chunk.columns.drop(drop_cols)

print(keep_cols.tolist())


# In[21]:


# Memory Comparison - Before vs. After Type Conversion
chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
counter = 0
total_initial_memory_mb = 0
total_final_memory_mb = 0
for chunk in chunks_it:
    counter += 1
    print("\nCHUNK {0}:".format(counter))
    total_initial_memory_mb += (chunk.memory_usage(deep=True).sum() / 1048576)

    # record each column's memory usage before conversion
    memory_comparison = []
    initial_memory_mb = 0
    for col in chunk:
        chunk_memory_mb = chunk[col].memory_usage(deep=True) / 1048576
        initial_memory_mb += chunk_memory_mb
        initial_status = f'{col:<30}: {chunk_memory_mb:0.2f} ({chunk[col].dtype})'
        memory_comparison.append(initial_status)

    # downcast the float column and convert the date column to datetime
    chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'],
                                               downcast='float')
    chunk['funded_at'] = pd.to_datetime(chunk['funded_at'])

    # compare each column's memory usage after conversion
    total_memory_mb = 0
    for i, col in enumerate(chunk):
        chunk_memory_mb = chunk[col].memory_usage(deep=True) / 1048576
        total_memory_mb += chunk_memory_mb
        previous_info = memory_comparison[i]
        print(f'{previous_info:<10} vs. {chunk_memory_mb:0.2f} ({chunk[col].dtype})')

    print(f'Memory: {initial_memory_mb:0.2f} mb vs {total_memory_mb:0.2f} mb')
    total_final_memory_mb += (chunk.memory_usage(deep=True).sum() / 1048576)

print(f'\nMemory: {total_initial_memory_mb:0.2f} mb vs {total_final_memory_mb:0.2f} mb')


# **Observations**:
# 
# By converting the two columns (`raised_amount_usd` to a more space-efficient float type and `funded_at` to a datetime type), we saved ~3 MB across the overall data set. Not a significant amount. There are opportunities to convert some of the other columns into categories to save further space in the dataframes; however, I'm not sure how much of that would translate into space saved in the SQLite database (a short sketch of the category idea appears just before the load step below).

# ---
# ## Loading Chunks with SQLite
# 
# The next step is to load each chunk into a table in a SQLite database so we can query the full data set.
# 
# - Create and connect to a new SQLite database file.
# - Expand on the existing chunk processing code to export each chunk to a new table in the SQLite database.
# - Query the table and make sure the data types match up to what you had in mind for each column.
# - Use the `!wc` IPython command to return the file size of the database.

# In[41]:


chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
total_size_list = []
for chunk in chunks_it:
    total_size_list.append(chunk.shape)
print(f"Total Size List: '{total_size_list}'")


# We verify that our rows add up to 52,870:
# 
# total_rows $= (5000 \times 10) + 2870 = 52870$
# 
# We have 14 columns.
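# Before loading the data into SQLite, a minimal sketch (not part of the original
# workflow) of the category idea mentioned above: reading a few low-cardinality text
# columns as the `category` dtype and comparing their footprint with plain object
# columns. The column choices here are illustrative assumptions; confirm them against
# the CSV header before relying on this.

# In[ ]:


# Illustrative column choices (assumed names; check against the CSV header)
category_candidates = ['company_category_code', 'company_country_code', 'funding_round_type']

as_object = pd.read_csv('crunchbase-investments.csv', nrows=5000,
                        encoding='ISO-8859-1', usecols=category_candidates)
as_category = pd.read_csv('crunchbase-investments.csv', nrows=5000,
                          encoding='ISO-8859-1', usecols=category_candidates,
                          dtype='category')

print(f'object  : {as_object.memory_usage(deep=True).sum() / 1048576:.2f} mb')
print(f'category: {as_category.memory_usage(deep=True).sum() / 1048576:.2f} mb')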
# In[22]:


conn = sqlite3.connect('crunchbase.db')

chunks_it = pd.read_csv('crunchbase-investments.csv', chunksize=5000,
                        encoding='ISO-8859-1', usecols=keep_cols)
counter = 0
for chunk in chunks_it:
    counter += 1
    print(f"Loading chunk {counter}...")
    chunk.to_sql("investments", conn, if_exists='append', index=False)

conn.close()
print("Completed loading all chunks to crunchbase.db!")


# In[23]:


get_ipython().system('ls')


# Successfully created the database, `crunchbase.db`.

# In[25]:


# SQLite Helper Functions
def run_query(query):
    with sqlite3.connect('crunchbase.db') as conn:
        return pd.read_sql(query, conn)

def show_tables():
    query = '''
    SELECT name, type
    FROM sqlite_master
    WHERE type IN ("table", "view");
    '''
    return run_query(query)


# In[26]:


show_tables()


# In[27]:


df = run_query('SELECT * from investments')
df.head()


# In[28]:


get_ipython().system('wc crunchbase.db')


# **Note**: Earlier, I calculated 52,870 rows across all the chunks, yet `wc` reports only 4,773 lines for the database file.
# 
# **Update**: After a quick double check, I realized that the line count returned by `wc` has nothing to do with the number of rows in our SQL table; it simply counts newline bytes in the binary database file. Below we verify that the row and column counts match our expectations.

# In[43]:


df = run_query('SELECT count(*) as num_rows from investments')
df


# In[34]:


# Table column info
df = run_query('PRAGMA table_info(investments)')
df


# ---
# ## Data Exploration and Analysis
# 
# We can use the pandas SQLite workflow we learned in the last mission to explore and analyze startup investments. Remember that each row isn't a unique company, but a unique investment from a single investor. This means that many startups will span multiple rows.
# 
# - Use the pandas SQLite workflow to answer the following questions:
#     - What proportion of the total amount of funds did the top 10% raise? What about the top 1%? Compare these values to the proportions the bottom 10% and bottom 1% raised.
#     - Which category of company attracted the most investments?
#     - Which investor contributed the most money (across all startups)?
#     - Which investors contributed the most money per startup?
#     - Which funding round was the most popular? Which was the least popular?

# ### Helper Functions

# In[31]:


def run_query(query):
    with sqlite3.connect('crunchbase.db') as conn:
        return pd.read_sql(query, conn)

def run_command(command):
    with sqlite3.connect('crunchbase.db') as conn:
        conn.isolation_level = None
        conn.execute(command)

def show_tables():
    query = '''
    SELECT name, type
    FROM sqlite_master
    WHERE type IN ("table", "view");
    '''
    return run_query(query)


# In[36]:


show_tables()


# In[53]:


query = '''
SELECT *
FROM investments
LIMIT 10'''
run_query(query)


# In[107]:


query = '''SELECT * FROM investments'''
investments = run_query(query)
investments.shape


# In[108]:


investments.head()


# In[80]:


# count the unique companies
unique_companies = list(investments['company_name'].unique())
len(unique_companies)


# In[90]:


# Found the additional row: its company_name is null, which explains the mismatch
# between the unique-companies count above and the per-startup groups below
investments[investments['company_name'].isnull()]


# ### Group Investments by Startup

# In[62]:


funds_raised_per_startup = investments.groupby('company_name')['raised_amount_usd'].sum()


# In[63]:


funds_raised_per_startup.head()


# In[65]:


funds_raised_per_startup.shape


# The difference between this number and the earlier count of unique companies (11574) comes from the row whose values are all null. We can safely ignore the discrepancy.
# 
# **Note**: pandas' `groupby` drops null keys by default, so that row simply doesn't produce a group.
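# With the per-startup totals in hand, here is a sketch of one possible way to approach
# the first question above (assuming we rank startups by total raised and ignore missing
# totals); it is an illustration, not the project's final answer:

# In[ ]:


totals = funds_raised_per_startup.dropna().sort_values(ascending=False)

top_10_pct_count = int(len(totals) * 0.10)
top_1_pct_count = int(len(totals) * 0.01)

top_10_pct_share = totals.iloc[:top_10_pct_count].sum() / totals.sum()
top_1_pct_share = totals.iloc[:top_1_pct_count].sum() / totals.sum()

print(f'Top 10% of startups: {top_10_pct_share:.1%} of all funds raised')
print(f'Top  1% of startups: {top_1_pct_share:.1%} of all funds raised')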
# In[95]:


# suppress displaying long numbers in scientific notation
pd.set_option('display.float_format', lambda x: '%.2f' % x)


# In[103]:


# reset the option
pd.reset_option('display.float_format')


# In[104]:


funds_raised_per_startup.sort_values(ascending=False)


# In[102]:


investments.groupby('company_name')['raised_amount_usd'].sum().sort_values()


# I'm getting $29,680,000,000 raised for Clearwire, which seems to be off by roughly a factor of 10: as of 2009, [Clearwire had raised $2.8 billion](https://www.ecnmag.com/news/2009/11/clearwires-fundraising-hits-28-billion), **not $29 billion**. Let's look at the individual Clearwire rows.

# In[110]:


clearwire_investments = investments[investments['company_name'] == 'Clearwire']


# In[111]:


clearwire_investments


# In[115]:


list(clearwire_investments['raised_amount_usd'])


# ---
# ## Next Steps
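# The remaining questions from the analysis list are still open. As one possible starting
# point, here is a sketch of the SQL side of "which funding round was the most / least
# popular?", assuming the round type is stored in a `funding_round_type` column (worth
# confirming against the `PRAGMA table_info(investments)` output above):

# In[ ]:


query = '''
SELECT funding_round_type,
       COUNT(*) AS num_investments,
       SUM(raised_amount_usd) AS total_raised
FROM investments
GROUP BY funding_round_type
ORDER BY num_investments DESC;
'''
run_query(query)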