import pandas as pd
pd.options.display.max_columns = 99
loans_df = pd.read_csv('loans_2007.csv', nrows=5)
loans_df
id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | last_credit_pull_d | collections_12_mths_ex_med | policy_code | application_type | acc_now_delinq | chargeoff_within_12_mths | delinq_amnt | pub_rec_bankruptcies | tax_liens | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1077501 | 1296599.0 | 5000.0 | 5000.0 | 4975.0 | 36 months | 10.65% | 162.87 | B | B2 | NaN | 10+ years | RENT | 24000.0 | Verified | Dec-2011 | Fully Paid | n | credit_card | Computer | 860xx | AZ | 27.65 | 0.0 | Jan-1985 | 1.0 | 3.0 | 0.0 | 13648.0 | 83.7% | 9.0 | f | 0.00 | 0.00 | 5863.155187 | 5833.84 | 5000.00 | 863.16 | 0.00 | 0.00 | 0.00 | Jan-2015 | 171.62 | Jun-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1077430 | 1314167.0 | 2500.0 | 2500.0 | 2500.0 | 60 months | 15.27% | 59.83 | C | C4 | Ryder | < 1 year | RENT | 30000.0 | Source Verified | Dec-2011 | Charged Off | n | car | bike | 309xx | GA | 1.00 | 0.0 | Apr-1999 | 5.0 | 3.0 | 0.0 | 1687.0 | 9.4% | 4.0 | f | 0.00 | 0.00 | 1008.710000 | 1008.71 | 456.46 | 435.17 | 0.00 | 117.08 | 1.11 | Apr-2013 | 119.66 | Sep-2013 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 1077175 | 1313524.0 | 2400.0 | 2400.0 | 2400.0 | 36 months | 15.96% | 84.33 | C | C5 | NaN | 10+ years | RENT | 12252.0 | Not Verified | Dec-2011 | Fully Paid | n | small_business | real estate business | 606xx | IL | 8.72 | 0.0 | Nov-2001 | 2.0 | 2.0 | 0.0 | 2956.0 | 98.5% | 10.0 | f | 0.00 | 0.00 | 3005.666844 | 3005.67 | 2400.00 | 605.67 | 0.00 | 0.00 | 0.00 | Jun-2014 | 649.91 | Jun-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 1076863 | 1277178.0 | 10000.0 | 10000.0 | 10000.0 | 36 months | 13.49% | 339.31 | C | C1 | AIR RESOURCES BOARD | 10+ years | RENT | 49200.0 | Source Verified | Dec-2011 | Fully Paid | n | other | personel | 917xx | CA | 20.00 | 0.0 | Feb-1996 | 1.0 | 10.0 | 0.0 | 5598.0 | 21% | 37.0 | f | 0.00 | 0.00 | 12231.890000 | 12231.89 | 10000.00 | 2214.92 | 16.97 | 0.00 | 0.00 | Jan-2015 | 357.48 | Apr-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 1075358 | 1311748.0 | 3000.0 | 3000.0 | 3000.0 | 60 months | 12.69% | 67.79 | B | B5 | University Medical Group | 1 year | RENT | 80000.0 | Source Verified | Dec-2011 | Current | n | other | Personal | 972xx | OR | 17.94 | 0.0 | Jan-1996 | 0.0 | 15.0 | 0.0 | 27783.0 | 53.9% | 38.0 | f | 461.73 | 461.73 | 3581.120000 | 3581.12 | 2538.27 | 1042.85 | 0.00 | 0.00 | 0.00 | Jun-2016 | 67.79 | Jun-2016 | 0.0 | 1.0 | INDIVIDUAL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
loans_df = pd.read_csv('loans_2007.csv', nrows=3000)
loans_df.info(memory_usage='deep', verbose=False)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3000 entries, 0 to 2999 Columns: 52 entries, id to tax_liens dtypes: float64(30), int64(1), object(21) memory usage: 4.6 MB
loans_df.memory_usage(deep=True).sum()/2**20
4.649013519287109
loans_df.columns
Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies', 'tax_liens'], dtype='object')
Following will be shown here:
These lists will be generated here:
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
total_memory_consumption = 0
columns_specifications = {}
for chunk in chunk_iter:
for column in chunk.columns:
if column in columns_specifications:
columns_specifications[column]['memory'] += chunk[column].memory_usage(deep=True)/2**20
columns_specifications[column]['missing_values'] += chunk[column].isnull().sum()
columns_specifications[column]['unique_values'] += chunk[column].nunique()
columns_specifications[column]['total_values'] += len(chunk.index)
else:
columns_specifications[column] = {}
columns_specifications[column]['memory'] = chunk[column].memory_usage(deep=True)/2**20
columns_specifications[column]['missing_values'] = chunk[column].isnull().sum()
columns_specifications[column]['dtype'] = str(chunk[column].dtype)
columns_specifications[column]['unique_values'] = chunk[column].nunique()
columns_specifications[column]['total_values'] = len(chunk.index)
total_memory_consumption += chunk.memory_usage(deep=True).sum()/2**20
print('total_memory_consumption: {value} MB'.format(value=round(total_memory_consumption,1)))
count_float = 0
count_string = 0
category_columns = []
float_columns = []
for column in columns_specifications:
if columns_specifications[column]['missing_values'] == 0:
print(column, columns_specifications[column])
if columns_specifications[column]['dtype'] == 'float64':
count_float += 1
float_columns.append(column)
if columns_specifications[column]['dtype'] == 'object':
count_string += 1
port = round(100*(columns_specifications[column]['unique_values']/columns_specifications[column]['total_values']),1)
if port < 50:
#print('{value} -- ({portion}%) less than 50% of the values are unique'.format(value=column, portion=port))
if port < 2:
category_columns.append(column)
print('Number of float64 columns: {value}'.format(value=count_float))
print('Number of string columns: {value}'.format(value=count_string))
total_memory_consumption: 66.2 MB id {'memory': 0.5113277435302734, 'unique_values': 42538, 'missing_values': 0, 'dtype': 'int64', 'total_values': 42538} Number of float64 columns: 30 Number of string columns: 21
the following columns (with less than 2 % of different entries) are selected for optimising memory footprint by changing the dtype to category
print(category_columns)
['verification_status', 'addr_state', 'initial_list_status', 'issue_d', 'term', 'sub_grade', 'emp_length', 'pymnt_plan', 'application_type', 'home_ownership', 'loan_status', 'grade', 'purpose']
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
total_memory_consumption = 0
for chunk in chunk_iter:
for column in category_columns:
chunk[column] = chunk[column].astype('category')
total_memory_consumption += chunk.memory_usage(deep=True).sum()/2**20
print('total_memory_consumption: {value} MB'.format(value=round(total_memory_consumption,1)))
total_memory_consumption: 32.4 MB
of the memory footprint by changing dtypes of further columns
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
total_memory_consumption = 0
for chunk in chunk_iter:
for column in category_columns:
chunk[column] = chunk[column].astype('category')
chunk['int_rate'] = chunk['int_rate'].str.replace('%','')
chunk['int_rate'] = chunk['int_rate'].astype('float32')
chunk['revol_util'] = chunk['revol_util'].str.replace('%','')
chunk['revol_util'] = chunk['revol_util'].astype('float32')
chunk['zip_code'] = chunk['zip_code'].str.replace('xx','')
chunk['zip_code'] = chunk['zip_code'].astype('float16')
total_memory_consumption += chunk.memory_usage(deep=True).sum()/2**20
print('total_memory_consumption: {value} MB'.format(value=round(total_memory_consumption,1)))
total_memory_consumption: 25.2 MB
date_columns = ['issue_d','earliest_cr_line','last_pymnt_d','last_credit_pull_d']
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, parse_dates=date_columns)
total_memory_consumption = 0
for chunk in chunk_iter:
for column in category_columns:
chunk[column] = chunk[column].astype('category')
chunk['int_rate'] = chunk['int_rate'].str.replace('%','')
chunk['int_rate'] = chunk['int_rate'].astype('float32')
chunk['revol_util'] = chunk['revol_util'].str.replace('%','')
chunk['revol_util'] = chunk['revol_util'].astype('float32')
chunk['zip_code'] = chunk['zip_code'].str.replace('xx','')
chunk['zip_code'] = chunk['zip_code'].astype('float16')
for column in date_columns:
chunk[column] = pd.to_datetime(chunk[column])
total_memory_consumption += chunk.memory_usage(deep=True).sum()/2**20
print('total_memory_consumption: {value} MB'.format(value=round(total_memory_consumption,1)))
total_memory_consumption: 18.6 MB
Following columns were identified:
print(float_columns)
['delinq_2yrs', 'recoveries', 'pub_rec_bankruptcies', 'total_pymnt_inv', 'open_acc', 'loan_amnt', 'chargeoff_within_12_mths', 'collection_recovery_fee', 'member_id', 'total_pymnt', 'policy_code', 'dti', 'acc_now_delinq', 'total_rec_late_fee', 'revol_bal', 'last_pymnt_amnt', 'out_prncp', 'out_prncp_inv', 'total_rec_int', 'funded_amnt_inv', 'annual_inc', 'installment', 'pub_rec', 'collections_12_mths_ex_med', 'total_acc', 'total_rec_prncp', 'inq_last_6mths', 'funded_amnt', 'tax_liens', 'delinq_amnt']
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, parse_dates=date_columns)
float_columns_max = {}
for chunk in chunk_iter:
for column in float_columns:
if column in float_columns_max:
float_columns_max[column].append(chunk[column].max())
else:
float_columns_max[column] = [chunk[column].max()]
for column in float_columns_max:
datatype = 'float64'
if 65500.0 <= max(float_columns_max[column]) < 3.4028235e+38:
datatype = 'float32'
elif max(float_columns_max[column]) < 65500.0:
datatype = 'float16'
float_columns_max[column] = [datatype, max(float_columns_max[column]), min(float_columns_max[column])]
print(float_columns_max)
{'total_rec_prncp': ['float16', 35000.02, 25000.0], 'delinq_amnt': ['float16', 6053.0, 0.0], 'collection_recovery_fee': ['float16', 7002.19, 1608.17], 'delinq_2yrs': ['float16', 13.0, 4.0], 'chargeoff_within_12_mths': ['float16', 0.0, 0.0], 'tax_liens': ['float16', 1.0, 0.0], 'pub_rec_bankruptcies': ['float16', 2.0, 1.0], 'installment': ['float16', 1305.19, 902.06], 'total_pymnt_inv': ['float16', 58563.68, 20319.6], 'recoveries': ['float16', 29623.35, 9209.52], 'collections_12_mths_ex_med': ['float16', 0.0, 0.0], 'total_acc': ['float16', 90.0, 63.0], 'pub_rec': ['float16', 5.0, 1.0], 'inq_last_6mths': ['float16', 33.0, 6.0], 'revol_bal': ['float32', 1207359.0, 131949.0], 'loan_amnt': ['float16', 35000.0, 25000.0], 'member_id': ['float32', 1314167.0, 247257.0], 'total_pymnt': ['float16', 58563.6799293133, 31309.067428487], 'policy_code': ['float16', 1.0, 1.0], 'dti': ['float16', 29.99, 24.95], 'acc_now_delinq': ['float16', 1.0, 0.0], 'total_rec_late_fee': ['float16', 209.000000006491, 79.011447549853], 'funded_amnt': ['float16', 35000.0, 25000.0], 'open_acc': ['float16', 47.0, 30.0], 'last_pymnt_amnt': ['float16', 36115.2, 23913.24], 'total_rec_int': ['float16', 23611.1, 6788.95], 'out_prncp': ['float16', 5794.29, 0.0], 'out_prncp_inv': ['float16', 5794.29, 0.0], 'annual_inc': ['float32', 6000000.0, 450000.0], 'funded_amnt_inv': ['float16', 35000.0, 15550.0]}
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, parse_dates=date_columns)
total_memory_consumption = 0
for chunk in chunk_iter:
for column in float_columns_max:
chunk[column] = chunk[column].astype(float_columns_max[column][0])
for column in category_columns:
chunk[column] = chunk[column].astype('category')
for column in date_columns:
chunk[column] = pd.to_datetime(chunk[column])
chunk['int_rate'] = chunk['int_rate'].str.replace('%','')
chunk['int_rate'] = chunk['int_rate'].astype('float32')
chunk['revol_util'] = chunk['revol_util'].str.replace('%','')
chunk['revol_util'] = chunk['revol_util'].astype('float32')
chunk['zip_code'] = chunk['zip_code'].str.replace('xx','')
chunk['zip_code'] = chunk['zip_code'].astype('float16')
total_memory_consumption += chunk.memory_usage(deep=True).sum()/2**20
print('total_memory_consumption: {value} MB'.format(value=round(total_memory_consumption,1)))
total_memory_consumption: 11.5 MB