Instructions:
Study of the columns for memory optimization:
Actions: term - remove the months string int_rate / revol_util - remove the % sign grade / sub_grade / home_ownership / verification_status / loan_status / pymnt_plan / purpose - category? addr_state / initial_list_status / application_type
emp_length - remove the string objects issue_d / earliest_cr_line / last_pymnt_d / last_credit_pull_d / -> convert to date zip_code -> check if possible to convert to integer
import pandas as pd
df = pd.read_csv(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv", nrows=5)
for n, row in df.iterrows():
print(row)
id 1077501 member_id 1296599.0 loan_amnt 5000.0 funded_amnt 5000.0 funded_amnt_inv 4975.0 term 36 months int_rate 10.65% installment 162.87 grade B sub_grade B2 emp_title NaN emp_length 10+ years home_ownership RENT annual_inc 24000.0 verification_status Verified issue_d Dec-2011 loan_status Fully Paid pymnt_plan n purpose credit_card title Computer zip_code 860xx addr_state AZ dti 27.65 delinq_2yrs 0.0 earliest_cr_line Jan-1985 inq_last_6mths 1.0 open_acc 3.0 pub_rec 0.0 revol_bal 13648.0 revol_util 83.7% total_acc 9.0 initial_list_status f out_prncp 0.0 out_prncp_inv 0.0 total_pymnt 5863.155187 total_pymnt_inv 5833.84 total_rec_prncp 5000.0 total_rec_int 863.16 total_rec_late_fee 0.0 recoveries 0.0 collection_recovery_fee 0.0 last_pymnt_d Jan-2015 last_pymnt_amnt 171.62 last_credit_pull_d Jun-2016 collections_12_mths_ex_med 0.0 policy_code 1.0 application_type INDIVIDUAL acc_now_delinq 0.0 chargeoff_within_12_mths 0.0 delinq_amnt 0.0 pub_rec_bankruptcies 0.0 tax_liens 0.0 Name: 0, dtype: object id 1077430 member_id 1314167.0 loan_amnt 2500.0 funded_amnt 2500.0 funded_amnt_inv 2500.0 term 60 months int_rate 15.27% installment 59.83 grade C sub_grade C4 emp_title Ryder emp_length < 1 year home_ownership RENT annual_inc 30000.0 verification_status Source Verified issue_d Dec-2011 loan_status Charged Off pymnt_plan n purpose car title bike zip_code 309xx addr_state GA dti 1.0 delinq_2yrs 0.0 earliest_cr_line Apr-1999 inq_last_6mths 5.0 open_acc 3.0 pub_rec 0.0 revol_bal 1687.0 revol_util 9.4% total_acc 4.0 initial_list_status f out_prncp 0.0 out_prncp_inv 0.0 total_pymnt 1008.71 total_pymnt_inv 1008.71 total_rec_prncp 456.46 total_rec_int 435.17 total_rec_late_fee 0.0 recoveries 117.08 collection_recovery_fee 1.11 last_pymnt_d Apr-2013 last_pymnt_amnt 119.66 last_credit_pull_d Sep-2013 collections_12_mths_ex_med 0.0 policy_code 1.0 application_type INDIVIDUAL acc_now_delinq 0.0 chargeoff_within_12_mths 0.0 delinq_amnt 0.0 pub_rec_bankruptcies 0.0 tax_liens 0.0 Name: 1, dtype: object id 1077175 member_id 1313524.0 loan_amnt 2400.0 funded_amnt 2400.0 funded_amnt_inv 2400.0 term 36 months int_rate 15.96% installment 84.33 grade C sub_grade C5 emp_title NaN emp_length 10+ years home_ownership RENT annual_inc 12252.0 verification_status Not Verified issue_d Dec-2011 loan_status Fully Paid pymnt_plan n purpose small_business title real estate business zip_code 606xx addr_state IL dti 8.72 delinq_2yrs 0.0 earliest_cr_line Nov-2001 inq_last_6mths 2.0 open_acc 2.0 pub_rec 0.0 revol_bal 2956.0 revol_util 98.5% total_acc 10.0 initial_list_status f out_prncp 0.0 out_prncp_inv 0.0 total_pymnt 3005.666844 total_pymnt_inv 3005.67 total_rec_prncp 2400.0 total_rec_int 605.67 total_rec_late_fee 0.0 recoveries 0.0 collection_recovery_fee 0.0 last_pymnt_d Jun-2014 last_pymnt_amnt 649.91 last_credit_pull_d Jun-2016 collections_12_mths_ex_med 0.0 policy_code 1.0 application_type INDIVIDUAL acc_now_delinq 0.0 chargeoff_within_12_mths 0.0 delinq_amnt 0.0 pub_rec_bankruptcies 0.0 tax_liens 0.0 Name: 2, dtype: object id 1076863 member_id 1277178.0 loan_amnt 10000.0 funded_amnt 10000.0 funded_amnt_inv 10000.0 term 36 months int_rate 13.49% installment 339.31 grade C sub_grade C1 emp_title AIR RESOURCES BOARD emp_length 10+ years home_ownership RENT annual_inc 49200.0 verification_status Source Verified issue_d Dec-2011 loan_status Fully Paid pymnt_plan n purpose other title personel zip_code 917xx addr_state CA dti 20.0 delinq_2yrs 0.0 earliest_cr_line Feb-1996 inq_last_6mths 1.0 open_acc 10.0 pub_rec 0.0 revol_bal 5598.0 revol_util 21% total_acc 37.0 initial_list_status f out_prncp 0.0 out_prncp_inv 0.0 total_pymnt 12231.89 total_pymnt_inv 12231.89 total_rec_prncp 10000.0 total_rec_int 2214.92 total_rec_late_fee 16.97 recoveries 0.0 collection_recovery_fee 0.0 last_pymnt_d Jan-2015 last_pymnt_amnt 357.48 last_credit_pull_d Apr-2016 collections_12_mths_ex_med 0.0 policy_code 1.0 application_type INDIVIDUAL acc_now_delinq 0.0 chargeoff_within_12_mths 0.0 delinq_amnt 0.0 pub_rec_bankruptcies 0.0 tax_liens 0.0 Name: 3, dtype: object id 1075358 member_id 1311748.0 loan_amnt 3000.0 funded_amnt 3000.0 funded_amnt_inv 3000.0 term 60 months int_rate 12.69% installment 67.79 grade B sub_grade B5 emp_title University Medical Group emp_length 1 year home_ownership RENT annual_inc 80000.0 verification_status Source Verified issue_d Dec-2011 loan_status Current pymnt_plan n purpose other title Personal zip_code 972xx addr_state OR dti 17.94 delinq_2yrs 0.0 earliest_cr_line Jan-1996 inq_last_6mths 0.0 open_acc 15.0 pub_rec 0.0 revol_bal 27783.0 revol_util 53.9% total_acc 38.0 initial_list_status f out_prncp 461.73 out_prncp_inv 461.73 total_pymnt 3581.12 total_pymnt_inv 3581.12 total_rec_prncp 2538.27 total_rec_int 1042.85 total_rec_late_fee 0.0 recoveries 0.0 collection_recovery_fee 0.0 last_pymnt_d Jun-2016 last_pymnt_amnt 67.79 last_credit_pull_d Jun-2016 collections_12_mths_ex_med 0.0 policy_code 1.0 application_type INDIVIDUAL acc_now_delinq 0.0 chargeoff_within_12_mths 0.0 delinq_amnt 0.0 pub_rec_bankruptcies 0.0 tax_liens 0.0 Name: 4, dtype: object
df = pd.read_csv(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv", nrows=3250)
df.memory_usage(deep=True).sum() / (1024 * 1024)
#Reading 3,250 rows reaches 5 MB memory consumption
4.962096214294434
Actions: term - remove the months string int_rate / revol_util - remove the % sign grade / sub_grade / home_ownership / verification_status / loan_status / pymnt_plan / purpose - category? addr_state / initial_list_status / application_type
emp_length - remove the string objects issue_d / earliest_cr_line / last_pymnt_d / last_credit_pull_d / -> convert to date zip_code -> check if possible to convert to integer
import re
cols_digits = ["term", "int_rate", "revol_util", "emp_length"]
cols_date = ["issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d"]
def extract_digits(col):
col = col.str.extract(r"(\d+)", expand=False)
return col
renaming = {"term":"term months", "int_rate":"int_rate %", "revol_util": "revol_util%", "emp_length":"emp_length years"}
df[cols_date] = df[cols_date].apply(pd.to_datetime)
df[cols_digits] = df[cols_digits].apply(extract_digits).apply(pd.to_numeric)
df = df.rename(columns=renaming)
df.head()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /var/folders/kk/v47zgwqd72z5_5_wynlh914m0000gn/T/ipykernel_6695/2526326875.py in <module> 12 13 df[cols_date] = df[cols_date].apply(pd.to_datetime) ---> 14 df[cols_digits] = df[cols_digits].apply(extract_digits).apply(pd.to_numeric) 15 16 df = df.rename(columns=renaming) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in __getitem__(self, key) 3028 if is_iterator(key): 3029 key = list(key) -> 3030 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1] 3031 3032 # take() does not accept boolean indexers ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing) 1264 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) 1265 -> 1266 self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing) 1267 return keyarr, indexer 1268 ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing) 1306 if missing == len(indexer): 1307 axis_name = self.obj._get_axis_name(axis) -> 1308 raise KeyError(f"None of [{key}] are in the [{axis_name}]") 1309 1310 ax = self.obj._get_axis(axis) KeyError: "None of [Index(['term', 'int_rate', 'revol_util', 'emp_length'], dtype='object')] are in the [columns]"
df.memory_usage(deep=True).sum() / (1024 * 1024)
3.560114860534668
We have reduced by 1 MB the data by simply converting to date time and numeric. Now we will categorize data into categories if unique values are less than 20.
check_cat = ["grade", "sub_grade", "home_ownership", "verification_status", "loan_status",
"pymnt_plan", "purpose", "addr_state", "initial_list_status", "application_type"]
num_unique = {}
path = r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv"
for cat in check_cat:
test1 = pd.read_csv(path, usecols=[cat])
num_unique[cat] = df[cat].nunique()
test = pd.DataFrame(num_unique.items(), columns=["Col_name", "Number Unique"])
test
Col_name | Number Unique | |
---|---|---|
0 | grade | 7 |
1 | sub_grade | 35 |
2 | home_ownership | 3 |
3 | verification_status | 3 |
4 | loan_status | 6 |
5 | pymnt_plan | 1 |
6 | purpose | 13 |
7 | addr_state | 43 |
8 | initial_list_status | 1 |
9 | application_type | 1 |
Except for addr_state, the rest will be converted into categories for memory optimization
check_cat = ["grade", "sub_grade", "home_ownership", "verification_status", "loan_status",
"pymnt_plan", "purpose", "initial_list_status", "application_type"]
for cat_col in check_cat:
df[cat_col] = df[cat_col].astype("category")
def memory_usage(df):
return str(round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2)) + " MB"
memory_usage(df)
'1.83 MB'
df.select_dtypes(include=["integer", "float"]).columns
to_integer = ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
'term months', 'installment', 'emp_length years',
'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
'tax_liens']
df[to_integer] = df[to_integer].apply(pd.to_numeric, downcast="integer")
df.select_dtypes(include="integer")
types = df.dtypes.to_frame()
types = types[types[0] != "datetime64[ns]"]
types
0 | |
---|---|
id | int32 |
member_id | int32 |
loan_amnt | int32 |
funded_amnt | int32 |
funded_amnt_inv | float64 |
term months | int8 |
int_rate % | int64 |
installment | float64 |
grade | category |
sub_grade | category |
emp_title | object |
emp_length years | float64 |
home_ownership | category |
annual_inc | float64 |
verification_status | category |
loan_status | category |
pymnt_plan | category |
purpose | category |
title | object |
zip_code | object |
addr_state | object |
dti | float64 |
delinq_2yrs | int8 |
inq_last_6mths | int8 |
open_acc | int8 |
pub_rec | int8 |
revol_bal | int32 |
revol_util% | int8 |
total_acc | int8 |
initial_list_status | category |
out_prncp | float64 |
out_prncp_inv | float64 |
total_pymnt | float64 |
total_pymnt_inv | float64 |
total_rec_prncp | float64 |
total_rec_int | float64 |
total_rec_late_fee | float64 |
recoveries | float64 |
collection_recovery_fee | float64 |
last_pymnt_amnt | float64 |
collections_12_mths_ex_med | int8 |
policy_code | int8 |
application_type | category |
acc_now_delinq | int8 |
chargeoff_within_12_mths | int8 |
delinq_amnt | int8 |
pub_rec_bankruptcies | int8 |
tax_liens | int8 |
We reduced from 5 MB to 1.83 MB 1000 rows memory usage.
Now read all the dataframe by cleaning each chunk and adapting the optimal data types.
def extract_digits(col):
col = col.str.extract(r"(\d+)", expand=False)
return col
def chunk_processing(path, chunksize=5000):
#Clean those columns
cols_digits = ["term", "int_rate", "revol_util", "emp_length"]
#Parse as dates
cols_date = ["issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d"]
#Convert to integer
numerical = ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
'term months', 'installment', 'emp_length years',
'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
'tax_liens']
#Convert to categorical
check_cat = ["grade", "sub_grade", "home_ownership", "verification_status", "loan_status",
"pymnt_plan", "purpose", "initial_list_status", "application_type"]
#Rename columns
renaming = {"term":"term months", "int_rate":"int_rate %", "revol_util": "revol_util%", "emp_length":"emp_length years"}
#Category dictionary
categorical = {}
for cat in check_cat:
categorical[cat] = "category"
chunk_iter = pd.read_csv(path, chunksize=chunksize, parse_dates=cols_date, dtype=categorical)
final_chunk = pd.DataFrame()
for chunk in chunk_iter:
chunk[cols_digits] = chunk[cols_digits].apply(lambda x: extract_digits(x)).apply(pd.to_numeric)
chunk = chunk.rename(columns=renaming)
chunk[numerical] = chunk[numerical].apply(pd.to_numeric, errors="coerce")
if len(final_chunk) == 0:
final_chunk = chunk
else:
final_chunk = pd.concat([final_chunk, chunk])
return final_chunk
optimized = memory_usage(chunk_processing(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv", 10000))
non_optimized = memory_usage(pd.read_csv(r"/Users/matteo/Desktop/Python Projects & Notes/loans_2007.csv"))
print("Optimized:", optimized, "//", "Non-optimized", non_optimized)
Optimized: 34.33 MB // Non-optimized 66.44 MB