#!/usr/bin/env python
# coding: utf-8

# # Data Prep
#
# For this vignette, we'll be using acquisition loan data from [Fannie Mae's public datasets](https://loanperformancedata.fanniemae.com/lppub/index.html#Portfolio). There is sadly not yet a Python script to facilitate reading in all the data (which are stored as pipe-delimited .txt files, released quarterly on roughly a one-year lag), so I had to infer some of my data-cleaning steps from the [R code that Fannie Mae made available](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_r_Primary.zip) to download and clean the data.
#
# If you are not already familiar with these data (or mortgage-finance data in general), I highly recommend referencing the first table, "Acquisition File Layout", in [this dictionary file](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_File_layout.pdf), along with the [corresponding glossary](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_Glossary.pdf), which explains what the values of each variable mean.
#
# For our purposes here, I will only be using acquisition data from the first and fourth quarters of 2017; thus, I will need to read in two .txt files. The code below generalizes to however many text files you want to use; you need only change the path_to_data variable to point at whatever path stores your .txt files.
#
# ## Setup

# In[1]:

# basic packages
import numpy as np
import pandas as pd
import datetime

# for data importing
import os
import csv

# for data cleaning
from janitor import clean_names, remove_empty


# In[2]:

# store the datetime of the most recent run of this notebook as a simple log
most_recent_run_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
f"This notebook was last executed on {most_recent_run_datetime}"


# ## Reading in the Data

# In[3]:

# location of my .txt data files
path_to_data = '../data/'

# instantiate an empty list to store the file names to read in
filelist = []

# loop over each filename in the specified directory
for filename in os.listdir(path_to_data):
    # check that the path corresponds to a file,
    # that the file in question is a .txt file,
    # and that we haven't already added that path to the list
    if os.path.isfile(path_to_data + filename) \
            and filename.endswith(".txt") \
            and path_to_data + filename not in filelist:
        # append the full path to our list of files
        filelist.append(path_to_data + filename)


# In[4]:

# take a look at the files we'll be reading in
filelist


# In[5]:

# instantiate an empty df that we'll use to store all our data
acq_df = pd.DataFrame()

# list of variable names for all the fields in the .txt files (adapted from the aforementioned docs)
acq_var_names = ['LOAN_ID', 'ORIG_CHN', 'Seller.Name', 'ORIG_RT', 'ORIG_AMT', 'ORIG_TRM',
                 'ORIG_DTE', 'FRST_DTE', 'OLTV', 'OCLTV', 'NUM_BO', 'DTI', 'CSCORE_B',
                 'FTHB_FLG', 'PURPOSE', 'PROP_TYP', 'NUM_UNIT', 'OCC_STAT', 'STATE', 'ZIP_3',
                 'MI_PCT', 'Product.Type', 'CSCORE_C', 'MI_TYPE', 'RELOCATION_FLG']

# loop over the .txt files, read them in, and append them to build our master acquisitions df
for f in filelist:
    # the delimiter is a pipe; there is no header row, so supply the pre-specified variable names
    temp_df = pd.read_csv(filepath_or_buffer=f, sep="|", header=None, names=acq_var_names)
    # concatenate row-wise and ignore the index values, as they don't convey meaning here
    acq_df = pd.concat(objs=[acq_df, temp_df], axis=0, ignore_index=True)
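# As an aside, calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which can get slow as the number of files grows. A minimal alternative sketch
# (not used elsewhere in this notebook; `read_acquisitions` is a hypothetical helper name)
# collects the per-file frames in a list and concatenates once at the end:

def read_acquisitions(paths, col_names):
    """Read each pipe-delimited acquisition file and stack them row-wise in a single pass."""
    frames = [
        pd.read_csv(filepath_or_buffer=p, sep="|", header=None, names=col_names)
        for p in paths
    ]
    # a single concat at the end avoids re-copying the accumulated DataFrame each iteration
    return pd.concat(objs=frames, axis=0, ignore_index=True)

# e.g., acq_df = read_acquisitions(filelist, acq_var_names)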
# In[6]:

# taking a look at the data structure we have so far
acq_df.head()


# ## Cleaning the Data
#
# For the most part, our data here are pretty clean. In the section below, we'll just make a few convenience changes, run a couple of checks, and create a few new variables.

# In[7]:

# use the pyjanitor package to take care of basic data cleaning
acq_df = (
    acq_df
    # clean the column names and remove any leading/trailing underscores
    .clean_names(strip_underscores=True)
    # remove any rows that are entirely NA
    .remove_empty()
)


# In[8]:

# create a few new fields as recommended by the aforementioned docs

# find the minimum credit score of borrower and co-borrower
acq_df['cscore_min'] = (
    acq_df[['cscore_b', 'cscore_c']].min(axis=1)
)

# find the origination value = origination amount / origination loan-to-value ratio
acq_df['orig_val'] = (
    acq_df['orig_amt'] / (acq_df['oltv'] / 100)
)

# check if the ocltv is null; if it is, set it to the oltv
acq_df['ocltv'] = (
    np.where(acq_df['ocltv'].isnull(), acq_df['oltv'], acq_df['ocltv'])
)


# In[9]:

# inspect our final cleaned data
acq_df.head()


# ## Final Data Trimming
#
# For the sake of size / efficiency, I am going to create a dataset composed of only loans originated in two months: January 2017 and December 2017.

# In[10]:

# filter the original acq_df to just loans with origination dates in Jan or Dec 2017
jan_and_dec_17_acqs = acq_df.loc[
    (acq_df['orig_dte'] == '01/2017') | (acq_df['orig_dte'] == '12/2017')
]

# inspect the dimensions of the resulting dataset
row_count, column_count = jan_and_dec_17_acqs.shape
f"The final dataset filtered to just Jan 2017 and Dec 2017 originations has {row_count} rows and {column_count} columns."


# In[11]:

# lastly, we'll save out this dataset for use elsewhere
jan_and_dec_17_acqs.to_csv(path_or_buf='../data/jan_and_dec_17_acqs.csv', index=False)
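# As a closing note, orig_dte is stored as an 'MM/YYYY' string, so the filter above is a plain
# string comparison. A minimal alternative sketch (assuming the 'MM/YYYY' format holds for
# every non-null row; `jan_and_dec_17_alt` is just an illustrative name) parses the field into
# a proper datetime and filters on that instead, which makes it easier to extend the selection
# to ranges such as a full quarter:

orig_dte_parsed = pd.to_datetime(acq_df['orig_dte'], format='%m/%Y')
jan_and_dec_17_alt = acq_df.loc[
    orig_dte_parsed.isin([pd.Timestamp('2017-01-01'), pd.Timestamp('2017-12-01')])
]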