#!/usr/bin/env python
# coding: utf-8

# # Data Prep
#
# For this vignette, we'll be using acquisition loan data from [Fannie Mae's public datasets](https://loanperformancedata.fanniemae.com/lppub/index.html#Portfolio). There is sadly not yet a Python script to facilitate reading in all the data (which are stored as pipe-delimited .txt files, released quarterly on roughly a one-year lag), so I had to infer some of my data-cleaning steps from the [R code that Fannie Mae made available](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_r_Primary.zip) to download and clean the data.
#
# If you are not already familiar with these data (or mortgage-finance data in general), I highly recommend referencing the first table, "Acquisition File Layout", in [this dictionary file](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_File_layout.pdf), along with the [corresponding glossary](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Glossary.pdf), which explains what the values of each variable mean.
#
# For our purposes here, I will only be using acquisition data from the first and fourth quarters of 2017; thus, I will need to read in two .txt files. The code below generalizes to however many text files you want to use; you need only change the path_to_data variable to point at whatever path stores your .txt files.
#
# ## Setup

# In[1]:

# basic packages
import numpy as np
import pandas as pd
import datetime

# for data importing
import os
import csv

# for data cleaning
from janitor import clean_names, remove_empty


# In[2]:

# store the datetime of the most recent run of this notebook as a simple log
most_recent_run_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
f"This notebook was last executed on {most_recent_run_datetime}"


# ## Reading in the Data

# In[3]:

# location of my .txt data files
path_to_data = '../data/'

# instantiate an empty list to store the file names to read in
filelist = []

# loop over each filename in the specified directory
for filename in os.listdir(path_to_data):
    # check that the path corresponds to a file,
    # that the file in question is a .txt file,
    # and that we haven't already added that path to the list
    if os.path.isfile(path_to_data + filename) \
            and filename.endswith(".txt") \
            and path_to_data + filename not in filelist:
        # append the full path to our list of files
        filelist.append(path_to_data + filename)


# In[4]:

# take a look at the files we'll be reading in
filelist


# In[5]:

# instantiate an empty df that we'll use to store all our data
acq_df = pd.DataFrame()

# list of variable names for all the fields in the .txt files (adapted from the aforementioned docs)
acq_var_names = ['LOAN_ID', 'ORIG_CHN', 'Seller.Name', 'ORIG_RT', 'ORIG_AMT', 'ORIG_TRM',
                 'ORIG_DTE', 'FRST_DTE', 'OLTV', 'OCLTV', 'NUM_BO', 'DTI', 'CSCORE_B',
                 'FTHB_FLG', 'PURPOSE', 'PROP_TYP', 'NUM_UNIT', 'OCC_STAT', 'STATE', 'ZIP_3',
                 'MI_PCT', 'Product.Type', 'CSCORE_C', 'MI_TYPE', 'RELOCATION_FLG']

# loop over the .txt files, read them in, and append them to build our master acquisitions df
for f in filelist:
    # the delimiter is a pipe; there is no header row, so supply the pre-specified variable names
    temp_df = pd.read_csv(filepath_or_buffer=f, sep="|", header=None, names=acq_var_names)
    # concatenate row-wise and ignore the index values, as they don't convey meaning here
    acq_df = pd.concat(objs=[acq_df, temp_df], axis=0, ignore_index=True)
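# As an aside, calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which can get slow as the number of files grows. A minimal alternative sketch
# (not used elsewhere in this notebook; `read_acquisitions` is a hypothetical helper name)
# collects the per-file frames in a list and concatenates once at the end:

def read_acquisitions(paths, col_names):
    """Read each pipe-delimited acquisition file and stack them row-wise in a single pass."""
    frames = [
        pd.read_csv(filepath_or_buffer=p, sep="|", header=None, names=col_names)
        for p in paths
    ]
    # a single concat at the end avoids re-copying the accumulated DataFrame each iteration
    return pd.concat(objs=frames, axis=0, ignore_index=True)

# e.g., acq_df = read_acquisitions(filelist, acq_var_names)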
# In[6]:

# taking a look at the data structure we have so far
acq_df.head()


# ## Cleaning the Data
#
# For the most part, our data here are pretty clean. In the section below, we'll just make a few convenience changes, run a couple of checks, and create a few new variables.

# In[7]:

# use the pyjanitor package to take care of basic data cleaning
acq_df = (
    acq_df
    # clean the column names and remove any leading/trailing underscores
    .clean_names(strip_underscores=True)
    # remove any rows that are entirely NA
    .remove_empty()
)


# In[8]:

# create a few new fields as recommended by the aforementioned docs

# find the minimum credit score of borrower and co-borrower
acq_df['cscore_min'] = (
    acq_df[['cscore_b', 'cscore_c']].min(axis=1)
)

# find the origination value = origination amount / origination loan-to-value ratio
acq_df['orig_val'] = (
    acq_df['orig_amt'] / (acq_df['oltv'] / 100)
)

# check if the ocltv is null; if it is, set it to the oltv
acq_df['ocltv'] = (
    np.where(acq_df['ocltv'].isnull(), acq_df['oltv'], acq_df['ocltv'])
)


# In[9]:

# inspect our final cleaned data
acq_df.head()


# ## Final Data Trimming
#
# For the sake of size / efficiency, I am going to create a dataset composed of only loans originated in two months: January 2017 and December 2017.

# In[10]:

# filter the original acq_df to just loans with origination dates in Jan or Dec 2017
jan_and_dec_17_acqs = acq_df.loc[
    (acq_df['orig_dte'] == '01/2017') | (acq_df['orig_dte'] == '12/2017')
]

# inspect the dimensions of the resulting dataset
row_count, column_count = jan_and_dec_17_acqs.shape
f"The final dataset filtered to just Jan 2017 and Dec 2017 originations has {row_count} rows and {column_count} columns."


# In[11]:

# lastly, we'll save out this dataset for use elsewhere
jan_and_dec_17_acqs.to_csv(path_or_buf='../data/jan_and_dec_17_acqs.csv', index=False)
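# As a closing note, orig_dte is stored as an 'MM/YYYY' string, so the filter above is a plain
# string comparison. A minimal alternative sketch (assuming the 'MM/YYYY' format holds for
# every non-null row; `jan_and_dec_17_alt` is just an illustrative name) parses the field into
# a proper datetime and filters on that instead, which makes it easier to extend the selection
# to ranges such as a full quarter:

orig_dte_parsed = pd.to_datetime(acq_df['orig_dte'], format='%m/%Y')
jan_and_dec_17_alt = acq_df.loc[
    orig_dte_parsed.isin([pd.Timestamp('2017-01-01'), pd.Timestamp('2017-12-01')])
]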