#!/usr/bin/env python
# coding: utf-8

# # Choropleth map and more analysis of CFPB consumer complaint data, downloaded on January 30, 2019.
# 
# Exploring the CFPB data.
# 
# Data Source: https://www.consumerfinance.gov/data-research/consumer-complaints/

# In[1]:


# import the CFPB data and 3 digit zip code data
import pandas as pd
import numpy as np
df_cfpb = pd.read_csv('Consumer_Complaints.csv',low_memory=False)


# In[2]:


df_zip = pd.read_excel('zipcode.xlsx')  # using 1st sheet for dataframe


# In[3]:


# 1 line of CFPB data
df_cfpb.head(1)


# In[4]:


# 1st 5 of zipcode data
df_zip.head()


# In[5]:


df_cfpb.columns


# In[6]:


# keep only the columns we need in CFPB data
col_to_drop = ['Consumer complaint narrative','Company public response','Tags','Consumer consent provided?',
              'Submitted via', 'Date sent to company', 'Company response to consumer',
              'Timely response?', 'Consumer disputed?']
df = df_cfpb.drop(col_to_drop,axis=1).copy()


# In[7]:


# change dates to datetime values and create year grouping
from datetime import datetime
df['Date'] = df['Date received'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y'))
df['Year'] = df['Date'].apply(lambda x: x.strftime('%Y')).copy()


# In[8]:


# keep only 2018 data and banks
def company_type(x):
    if x == 'EQUIFAX, INC.':
        return 'Credit Union'
    elif x == 'Experian Information Solutions Inc.':
        return 'Credit Union'
    elif x == 'TRANSUNION INTERMEDIATE HOLDINGS, INC.':
        return 'Credit Union'
    else:
        return 'Bank'
df['Type'] = df['Company'].apply(company_type)
df_bb = df[(df['Type']=='Bank') & (df['Year'] == '2018')].copy()


# In[9]:


# Sort dates and reset index
df_bb.sort_values(by='Date',ascending=True,inplace=True)
df_bb.reset_index(inplace=True,drop=True)


# In[10]:


# create common zip code column for both dataframes
df_bb['ZIP code'] = df_bb['ZIP code'].astype(str)
df_bb['zip3'] = df_bb['ZIP code'].apply(lambda x: x[:3])
df_zip['prefix'] = df_zip['prefix'].astype(str)
df_zip['zip3'] = df_zip['prefix'].apply(lambda x: x[:3])


# In[11]:


# check if there is a new zip3 column
df_bb.head(1)


# In[12]:


# set zip code file index to zip3 column just created and reset index for later mapping 
df_zip.set_index('zip3',inplace=True)
df_zip.reset_index(inplace=True)


# In[13]:


df_zip.head(1)


# In[14]:


# we don't need the prefix column
df_zip.drop(columns='prefix',inplace=True)


# In[15]:


# map destination to zip code
mapping = dict(df_zip[['zip3','3‑Digit Destinations']].values)
df_bb['loc'] = df_bb.zip3.map(mapping)


# In[16]:


# check if mapping worked
df_bb.head(3)


# In[17]:


# how many nan data points in loc column
df_bb[df_bb['loc'].isnull()]['Complaint ID'].count()       


# INFORMATION GATHERING

# In[18]:


# Top 5 metros with the most complaints 
df_bb['loc'].value_counts().head()


# In[19]:


# Top 5 companies with the most complaints 
df_bb['Company'].value_counts().head()


# In[20]:


# Top 5 zip with the most complaints 
df_bb[df_bb['loc'].notnull()]['ZIP code'].value_counts().head()


# In[21]:


# Top 5 states with the most complaints 
df_bb['State'].value_counts().head()


# Examine the top issues for each product and other groupings

# In[22]:


df_bb['Product'].value_counts()


# In[23]:


df_bb[df_bb['Product'] == 'Mortgage']['Issue'].value_counts().head()


# In[24]:


df_bb[df_bb['Product'] == 'Credit card or prepaid card']['Issue'].value_counts().head()


# In[25]:


df_bb[df_bb['Product'] == 'Debt collection']['Issue'].value_counts().head()


# In[26]:


df_bb[df_bb['Product'] == 'Credit reporting, credit repair services, or other personal consumer reports'
     ]['Issue'].value_counts().head()


# In[27]:


df_bb[df_bb['Product'] == 'Checking or savings account']['Issue'].value_counts().head()


# In[28]:


# top companies that have complaints managing an account
df_bb[df_bb['Issue'] == 'Managing an account']['Company'].value_counts().head()


# In[29]:


#top locations for fees or interest complaints
df_bb[df_bb['Issue'] == 'Fees or interest']['loc'].value_counts().head()


# In[30]:


# Locations where struggling to pay mortgage is a complaint
df_bb[df_bb['Issue'] == 'Struggling to pay mortgage']['loc'].value_counts().head()


# # Data Visualization

# In[31]:


# import plot libraries for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
get_ipython().run_line_magic('matplotlib', 'inline')


# In[63]:


# create new dataframe with only mortgage product complaints for 2018
df_mort = df_bb[df_bb['Product'] == 'Mortgage'].copy()


# In[64]:


# dataframe of total mortgage complaints by state
compl_by_state = pd.pivot_table(df_mort,index=['State'], values=['Complaint ID'],aggfunc = len)


# In[65]:


compl_by_state.head()


# In[66]:


compl_by_state.reset_index(inplace=True)


# In[67]:


# create pivot table at state and issue to figure out top issue for mortgages in that state
top_issu = pd.pivot_table(df_mort,index=['State','Issue'], values=['Complaint ID'],aggfunc = len)


# In[68]:


top_issu = top_issu['Complaint ID'].unstack(0).T


# In[69]:


# Find issue by highest # of complaints
top_issu['TopIss'] = top_issu.idxmax(axis=1)


# In[70]:


# find # of complaints for TopIss
top_issu['Max'] = top_issu.max(axis=1)


# In[71]:


top_issu.head()


# In[72]:


top_issu.reset_index(inplace=True)


# In[73]:


top_issu.columns


# In[74]:


# add new column - tot # of mortgage product complaints by state
data_tot_compl = dict(compl_by_state[['State','Complaint ID']].values)
top_issu['totCompl'] = top_issu.State.map(data_tot_compl)


# In[75]:


top_issu.sort_values(by='totCompl',inplace=True,ascending=False)


# In[76]:


top_issu['MaxPerc'] = top_issu['Max'] / top_issu['totCompl'] * 100


# In[77]:


top_issu.sort_values(by='totCompl',inplace=True,ascending=False)
top_issu.reset_index(drop=True,inplace=True)


# In[78]:


# check if we added new columns correctly of total complaints for each state and top issue as % of total compl
top_issu.head()


# In[79]:


# create top location for mortgage complaints
top_loc = pd.pivot_table(df_mort,index=['State','loc'], values=['Complaint ID'],aggfunc = len)


# In[80]:


top_loc = top_loc['Complaint ID'].unstack(0).T
# Find loc by highest # of complaints
top_loc['Toploc'] = top_loc.idxmax(axis=1)
# find # of complaints for Top location
top_loc['Max'] = top_loc.max(axis=1)


# In[81]:


top_loc.reset_index(inplace=True)


# In[82]:


top_loc.head()


# In[83]:


top_loc.columns


# In[84]:


# create new dataframe with 1st, 2nd to last and last columns
location = top_loc.iloc[:,[0,-2]]


# In[85]:


location.head(2)


# In[86]:


chart = top_issu.iloc[:,[0,-4,-3,-2,-1]].copy()


# In[87]:


chart.head()


# In[88]:


# add new column - top location
dict_toploc = dict(location[['State','Toploc']].values)
chart['toploc'] = chart.State.map(dict_toploc)


# In[99]:


# Grab US states for chart
chart_geo = chart.iloc[:52,:].copy()


# In[100]:


# need to remove Puerto Rico
chart_geo_50 = chart_geo[chart_geo['State'] != 'PR'].copy()


# In[101]:


chart_geo_50.info()


# In[102]:


pd.options.display.float_format = '{:.1f}'.format
chart_geo_50.head()


# In[103]:


# create string %
chart_geo_50['chperc'] = chart_geo_50['MaxPerc'].apply(lambda x: '{0:.1f}%'.format(x))


# In[104]:


chart_geo_50.head()


# In[105]:


# create column for text in each state
chart_geo_50['text'] = chart_geo_50.iloc[:,1] + ', '+ chart_geo_50.iloc[:,-1]+', '+ chart_geo_50.iloc[:,-2] 


# In[106]:


# rename columns for clarity
chart_geo_50.rename(columns={'TopIss': 'Top Issue', 'totCompl': 'Total Complaints'}, inplace=True)


# In[107]:


chart_geo_50.head()


# In[108]:


import plotly.plotly as py
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


# In[109]:


init_notebook_mode(connected=True) 


# # Create the Choropleth Map

# In[110]:


data = dict(type='choropleth',
            colorscale = 'Viridis',
            reversescale = True,
            locations = chart_geo_50['State'],
            z = chart_geo_50['Total Complaints'],
            locationmode = 'USA-states',
            text = chart_geo_50['text'],
            marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)),
            colorbar = {'title':"Total Mortgage Product Complaints"}
            ) 


# In[111]:


layout = dict(title = 'CFPB Total Mortgage Product Complaints by State in 2018',
              geo = dict(scope='usa',
                         showlakes = True,
                         lakecolor = 'rgb(85,173,240)')
             )


# In[112]:


choromap = go.Figure(data = [data],layout = layout)
#plot(choromap,validate=False)
iplot(choromap,show_link=True)


# # BUBBLE CHART

# In[113]:


mort_loc = df_mort.groupby(['loc']).agg({'Complaint ID': pd.Series.nunique})


# In[114]:


# dataframe with loc and mortgage complaints counts
mort_loc.head()


# In[115]:


# Sort values and rank
mort_loc.sort_values(by='Complaint ID',ascending=False, inplace=True)
mort_loc['Rank'] = mort_loc['Complaint ID'].rank(ascending=False)


# In[116]:


mort_loc.rename(columns={'Complaint ID': '# of Complaints'}, inplace=True)
mort_loc.head()


# In[117]:


# reset index so loc is a column
mort_loc.reset_index(inplace=True)


# In[118]:


# create the dataframe to calc % of complaints: Struggling to pay mortgage
mort_trou = df_mort[df_mort['Issue'] == 'Struggling to pay mortgage'
                 ].groupby(['loc']).agg({'Complaint ID': pd.Series.nunique})


# In[119]:


# rename columns, sort values, reset index
mort_trou.rename(columns={'Complaint ID': 'StrugToPay'}, inplace=True)
mort_trou.sort_values(by='StrugToPay',ascending=False, inplace=True)
mort_trou.reset_index(inplace=True)


# In[120]:


# merge data frames 
new_df = pd.merge(mort_loc,mort_trou,how='left',on='loc')


# In[121]:


new_df.head()


# In[122]:


# create new column and format for chart
new_df['StrugPay%'] = new_df['StrugToPay'] / new_df['# of Complaints'] * 100
pd.options.display.float_format = '{:.1f}'.format


# In[123]:


# change rank column to integers
new_df.Rank = new_df.Rank.astype(int)


# In[124]:


# only need top 15 for chart
new_15 = new_df.iloc[:15,:].set_index('loc')


# In[125]:


new_15


# In[126]:


# bubble chart
Top15 = new_15
#fig, ax = plt.subplots()
ax = Top15.plot(x='Rank', y='StrugPay%', kind='scatter',
                c=['#e41a1c','#377eb8','#e41a1c','#4daf4a','#4daf4a','#377eb8','#4daf4a','#e41a1c',
                   '#4daf4a','#e41a1c','#4daf4a','#4daf4a','#e41a1c','#dede00','#ff7f00'],
                xticks=range(1,16), s=(5*Top15['# of Complaints']/100)**3, alpha=.75, figsize=[16,6]);
for i, txt in enumerate(Top15.index):
    ax.annotate(txt, [Top15['Rank'][i], Top15['StrugPay%'][i]], ha='center')
plt.savefig('bubble.jpg',bbox_inches="tight")
print("Rank is by # of Total Mortgage product complaints. Y-axis is the % of those complaints which had issue: Struggling to pay mortgage")


# In[ ]: