#!/usr/bin/env python # coding: utf-8 # # Choropleth map and more analysis of CFPB consumer complaint data, downloaded on January 30, 2019. # # Exploring the CFPB data. # # Data Source: https://www.consumerfinance.gov/data-research/consumer-complaints/ # In[1]: # import the CFPB data and 3 digit zip code data import pandas as pd import numpy as np df_cfpb = pd.read_csv('Consumer_Complaints.csv',low_memory=False) # In[2]: df_zip = pd.read_excel('zipcode.xlsx') # using 1st sheet for dataframe # In[3]: # 1 line of CFPB data df_cfpb.head(1) # In[4]: # 1st 5 of zipcode data df_zip.head() # In[5]: df_cfpb.columns # In[6]: # keep only the columns we need in CFPB data col_to_drop = ['Consumer complaint narrative','Company public response','Tags','Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?'] df = df_cfpb.drop(col_to_drop,axis=1).copy() # In[7]: # change dates to datetime values and create year grouping from datetime import datetime df['Date'] = df['Date received'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y')) df['Year'] = df['Date'].apply(lambda x: x.strftime('%Y')).copy() # In[8]: # keep only 2018 data and banks def company_type(x): if x == 'EQUIFAX, INC.': return 'Credit Union' elif x == 'Experian Information Solutions Inc.': return 'Credit Union' elif x == 'TRANSUNION INTERMEDIATE HOLDINGS, INC.': return 'Credit Union' else: return 'Bank' df['Type'] = df['Company'].apply(company_type) df_bb = df[(df['Type']=='Bank') & (df['Year'] == '2018')].copy() # In[9]: # Sort dates and reset index df_bb.sort_values(by='Date',ascending=True,inplace=True) df_bb.reset_index(inplace=True,drop=True) # In[10]: # create common zip code column for both dataframes df_bb['ZIP code'] = df_bb['ZIP code'].astype(str) df_bb['zip3'] = df_bb['ZIP code'].apply(lambda x: x[:3]) df_zip['prefix'] = df_zip['prefix'].astype(str) df_zip['zip3'] = df_zip['prefix'].apply(lambda x: x[:3]) # In[11]: # check if there is a new zip3 column df_bb.head(1) # In[12]: # set zip code file index to zip3 column just created and reset index for later mapping df_zip.set_index('zip3',inplace=True) df_zip.reset_index(inplace=True) # In[13]: df_zip.head(1) # In[14]: # we don't need the prefix column df_zip.drop(columns='prefix',inplace=True) # In[15]: # map destination to zip code mapping = dict(df_zip[['zip3','3‑Digit Destinations']].values) df_bb['loc'] = df_bb.zip3.map(mapping) # In[16]: # check if mapping worked df_bb.head(3) # In[17]: # how many nan data points in loc column df_bb[df_bb['loc'].isnull()]['Complaint ID'].count() # INFORMATION GATHERING # In[18]: # Top 5 metros with the most complaints df_bb['loc'].value_counts().head() # In[19]: # Top 5 companies with the most complaints df_bb['Company'].value_counts().head() # In[20]: # Top 5 zip with the most complaints df_bb[df_bb['loc'].notnull()]['ZIP code'].value_counts().head() # In[21]: # Top 5 states with the most complaints df_bb['State'].value_counts().head() # Examine the top issues for each product and other groupings # In[22]: df_bb['Product'].value_counts() # In[23]: df_bb[df_bb['Product'] == 'Mortgage']['Issue'].value_counts().head() # In[24]: df_bb[df_bb['Product'] == 'Credit card or prepaid card']['Issue'].value_counts().head() # In[25]: df_bb[df_bb['Product'] == 'Debt collection']['Issue'].value_counts().head() # In[26]: df_bb[df_bb['Product'] == 'Credit reporting, credit repair services, or other personal consumer reports' ]['Issue'].value_counts().head() # In[27]: df_bb[df_bb['Product'] == 'Checking or savings account']['Issue'].value_counts().head() # In[28]: # top companies that have complaints managing an account df_bb[df_bb['Issue'] == 'Managing an account']['Company'].value_counts().head() # In[29]: #top locations for fees or interest complaints df_bb[df_bb['Issue'] == 'Fees or interest']['loc'].value_counts().head() # In[30]: # Locations where struggling to pay mortgage is a complaint df_bb[df_bb['Issue'] == 'Struggling to pay mortgage']['loc'].value_counts().head() # # Data Visualization # In[31]: # import plot libraries for visualizations import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') get_ipython().run_line_magic('matplotlib', 'inline') # In[63]: # create new dataframe with only mortgage product complaints for 2018 df_mort = df_bb[df_bb['Product'] == 'Mortgage'].copy() # In[64]: # dataframe of total mortgage complaints by state compl_by_state = pd.pivot_table(df_mort,index=['State'], values=['Complaint ID'],aggfunc = len) # In[65]: compl_by_state.head() # In[66]: compl_by_state.reset_index(inplace=True) # In[67]: # create pivot table at state and issue to figure out top issue for mortgages in that state top_issu = pd.pivot_table(df_mort,index=['State','Issue'], values=['Complaint ID'],aggfunc = len) # In[68]: top_issu = top_issu['Complaint ID'].unstack(0).T # In[69]: # Find issue by highest # of complaints top_issu['TopIss'] = top_issu.idxmax(axis=1) # In[70]: # find # of complaints for TopIss top_issu['Max'] = top_issu.max(axis=1) # In[71]: top_issu.head() # In[72]: top_issu.reset_index(inplace=True) # In[73]: top_issu.columns # In[74]: # add new column - tot # of mortgage product complaints by state data_tot_compl = dict(compl_by_state[['State','Complaint ID']].values) top_issu['totCompl'] = top_issu.State.map(data_tot_compl) # In[75]: top_issu.sort_values(by='totCompl',inplace=True,ascending=False) # In[76]: top_issu['MaxPerc'] = top_issu['Max'] / top_issu['totCompl'] * 100 # In[77]: top_issu.sort_values(by='totCompl',inplace=True,ascending=False) top_issu.reset_index(drop=True,inplace=True) # In[78]: # check if we added new columns correctly of total complaints for each state and top issue as % of total compl top_issu.head() # In[79]: # create top location for mortgage complaints top_loc = pd.pivot_table(df_mort,index=['State','loc'], values=['Complaint ID'],aggfunc = len) # In[80]: top_loc = top_loc['Complaint ID'].unstack(0).T # Find loc by highest # of complaints top_loc['Toploc'] = top_loc.idxmax(axis=1) # find # of complaints for Top location top_loc['Max'] = top_loc.max(axis=1) # In[81]: top_loc.reset_index(inplace=True) # In[82]: top_loc.head() # In[83]: top_loc.columns # In[84]: # create new dataframe with 1st, 2nd to last and last columns location = top_loc.iloc[:,[0,-2]] # In[85]: location.head(2) # In[86]: chart = top_issu.iloc[:,[0,-4,-3,-2,-1]].copy() # In[87]: chart.head() # In[88]: # add new column - top location dict_toploc = dict(location[['State','Toploc']].values) chart['toploc'] = chart.State.map(dict_toploc) # In[99]: # Grab US states for chart chart_geo = chart.iloc[:52,:].copy() # In[100]: # need to remove Puerto Rico chart_geo_50 = chart_geo[chart_geo['State'] != 'PR'].copy() # In[101]: chart_geo_50.info() # In[102]: pd.options.display.float_format = '{:.1f}'.format chart_geo_50.head() # In[103]: # create string % chart_geo_50['chperc'] = chart_geo_50['MaxPerc'].apply(lambda x: '{0:.1f}%'.format(x)) # In[104]: chart_geo_50.head() # In[105]: # create column for text in each state chart_geo_50['text'] = chart_geo_50.iloc[:,1] + ', '+ chart_geo_50.iloc[:,-1]+', '+ chart_geo_50.iloc[:,-2] # In[106]: # rename columns for clarity chart_geo_50.rename(columns={'TopIss': 'Top Issue', 'totCompl': 'Total Complaints'}, inplace=True) # In[107]: chart_geo_50.head() # In[108]: import plotly.plotly as py import plotly.graph_objs as go from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # In[109]: init_notebook_mode(connected=True) # # Create the Choropleth Map # In[110]: data = dict(type='choropleth', colorscale = 'Viridis', reversescale = True, locations = chart_geo_50['State'], z = chart_geo_50['Total Complaints'], locationmode = 'USA-states', text = chart_geo_50['text'], marker = dict(line = dict(color = 'rgb(255,255,255)',width = 1)), colorbar = {'title':"Total Mortgage Product Complaints"} ) # In[111]: layout = dict(title = 'CFPB Total Mortgage Product Complaints by State in 2018', geo = dict(scope='usa', showlakes = True, lakecolor = 'rgb(85,173,240)') ) # In[112]: choromap = go.Figure(data = [data],layout = layout) #plot(choromap,validate=False) iplot(choromap,show_link=True) # # BUBBLE CHART # In[113]: mort_loc = df_mort.groupby(['loc']).agg({'Complaint ID': pd.Series.nunique}) # In[114]: # dataframe with loc and mortgage complaints counts mort_loc.head() # In[115]: # Sort values and rank mort_loc.sort_values(by='Complaint ID',ascending=False, inplace=True) mort_loc['Rank'] = mort_loc['Complaint ID'].rank(ascending=False) # In[116]: mort_loc.rename(columns={'Complaint ID': '# of Complaints'}, inplace=True) mort_loc.head() # In[117]: # reset index so loc is a column mort_loc.reset_index(inplace=True) # In[118]: # create the dataframe to calc % of complaints: Struggling to pay mortgage mort_trou = df_mort[df_mort['Issue'] == 'Struggling to pay mortgage' ].groupby(['loc']).agg({'Complaint ID': pd.Series.nunique}) # In[119]: # rename columns, sort values, reset index mort_trou.rename(columns={'Complaint ID': 'StrugToPay'}, inplace=True) mort_trou.sort_values(by='StrugToPay',ascending=False, inplace=True) mort_trou.reset_index(inplace=True) # In[120]: # merge data frames new_df = pd.merge(mort_loc,mort_trou,how='left',on='loc') # In[121]: new_df.head() # In[122]: # create new column and format for chart new_df['StrugPay%'] = new_df['StrugToPay'] / new_df['# of Complaints'] * 100 pd.options.display.float_format = '{:.1f}'.format # In[123]: # change rank column to integers new_df.Rank = new_df.Rank.astype(int) # In[124]: # only need top 15 for chart new_15 = new_df.iloc[:15,:].set_index('loc') # In[125]: new_15 # In[126]: # bubble chart Top15 = new_15 #fig, ax = plt.subplots() ax = Top15.plot(x='Rank', y='StrugPay%', kind='scatter', c=['#e41a1c','#377eb8','#e41a1c','#4daf4a','#4daf4a','#377eb8','#4daf4a','#e41a1c', '#4daf4a','#e41a1c','#4daf4a','#4daf4a','#e41a1c','#dede00','#ff7f00'], xticks=range(1,16), s=(5*Top15['# of Complaints']/100)**3, alpha=.75, figsize=[16,6]); for i, txt in enumerate(Top15.index): ax.annotate(txt, [Top15['Rank'][i], Top15['StrugPay%'][i]], ha='center') plt.savefig('bubble.jpg',bbox_inches="tight") print("Rank is by # of Total Mortgage product complaints. Y-axis is the % of those complaints which had issue: Struggling to pay mortgage") # In[ ]: