預測借款人是否會償還貸款
分類問題:0 為會償還貸款,1 為不會償還
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
pd.set_option('display.max_columns', 500)
import numpy as np # fundamental package for scientific computing with Python
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
# from plotly import tools
# import plotly.tools as tls
# import squarify
# from mpl_toolkits.basemap import Basemap
# from numpy import array
# from matplotlib import cm
# import cufflinks and offline mode
import cufflinks as cf
cf.go_offline()
import os
print(os.listdir("../input"))
['application_test.csv', 'HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'previous_application.csv', 'bureau_balance.csv', 'sample_submission.csv']
application_train = pd.read_csv('../input/application_train.csv')
POS_CASH_balance = pd.read_csv('../input/POS_CASH_balance.csv')
bureau_balance = pd.read_csv('../input/bureau_balance.csv')
previous_application = pd.read_csv('../input/previous_application.csv')
installments_payments = pd.read_csv('../input/installments_payments.csv')
credit_card_balance = pd.read_csv('../input/credit_card_balance.csv')
bureau = pd.read_csv('../input/bureau.csv')
application_test = pd.read_csv('../input/application_test.csv')
print('Size of application_train data', application_train.shape)
print('Size of POS_CASH_balance data', POS_CASH_balance.shape)
print('Size of bureau_balance data', bureau_balance.shape)
print('Size of previous_application data', previous_application.shape)
print('Size of installments_payments data', installments_payments.shape)
print('Size of credit_card_balance data', credit_card_balance.shape)
print('Size of bureau data', bureau.shape)
Size of application_train data (307511, 122) Size of POS_CASH_balance data (10001358, 8) Size of bureau_balance data (27299925, 3) Size of previous_application data (1670214, 37) Size of installments_payments data (13605401, 8) Size of credit_card_balance data (3840312, 23) Size of bureau data (1716428, 17)
application_train.head()
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
application_train.columns.values
array(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'], dtype=object)
POS_CASH_balance.head()
SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | CNT_INSTALMENT | CNT_INSTALMENT_FUTURE | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
---|---|---|---|---|---|---|---|---|
0 | 1803195 | 182943 | -31 | 48.0 | 45.0 | Active | 0 | 0 |
1 | 1715348 | 367990 | -33 | 36.0 | 35.0 | Active | 0 | 0 |
2 | 1784872 | 397406 | -32 | 12.0 | 9.0 | Active | 0 | 0 |
3 | 1903291 | 269225 | -35 | 48.0 | 42.0 | Active | 0 | 0 |
4 | 2341044 | 334279 | -35 | 36.0 | 35.0 | Active | 0 | 0 |
bureau_balance.head()
SK_ID_BUREAU | MONTHS_BALANCE | STATUS | |
---|---|---|---|
0 | 5715448 | 0 | C |
1 | 5715448 | -1 | C |
2 | 5715448 | -2 | C |
3 | 5715448 | -3 | C |
4 | 5715448 | -4 | C |
previous_application.head()
SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.0 | 0.182832 | 0.867336 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
installments_payments.head()
SK_ID_PREV | SK_ID_CURR | NUM_INSTALMENT_VERSION | NUM_INSTALMENT_NUMBER | DAYS_INSTALMENT | DAYS_ENTRY_PAYMENT | AMT_INSTALMENT | AMT_PAYMENT | |
---|---|---|---|---|---|---|---|---|
0 | 1054186 | 161674 | 1.0 | 6 | -1180.0 | -1187.0 | 6948.360 | 6948.360 |
1 | 1330831 | 151639 | 0.0 | 34 | -2156.0 | -2156.0 | 1716.525 | 1716.525 |
2 | 2085231 | 193053 | 2.0 | 1 | -63.0 | -63.0 | 25425.000 | 25425.000 |
3 | 2452527 | 199697 | 1.0 | 3 | -2418.0 | -2426.0 | 24350.130 | 24350.130 |
4 | 2714724 | 167756 | 1.0 | 2 | -1383.0 | -1366.0 | 2165.040 | 2160.585 |
credit_card_balance.head()
SK_ID_PREV | SK_ID_CURR | MONTHS_BALANCE | AMT_BALANCE | AMT_CREDIT_LIMIT_ACTUAL | AMT_DRAWINGS_ATM_CURRENT | AMT_DRAWINGS_CURRENT | AMT_DRAWINGS_OTHER_CURRENT | AMT_DRAWINGS_POS_CURRENT | AMT_INST_MIN_REGULARITY | AMT_PAYMENT_CURRENT | AMT_PAYMENT_TOTAL_CURRENT | AMT_RECEIVABLE_PRINCIPAL | AMT_RECIVABLE | AMT_TOTAL_RECEIVABLE | CNT_DRAWINGS_ATM_CURRENT | CNT_DRAWINGS_CURRENT | CNT_DRAWINGS_OTHER_CURRENT | CNT_DRAWINGS_POS_CURRENT | CNT_INSTALMENT_MATURE_CUM | NAME_CONTRACT_STATUS | SK_DPD | SK_DPD_DEF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2562384 | 378907 | -6 | 56.970 | 135000 | 0.0 | 877.5 | 0.0 | 877.5 | 1700.325 | 1800.0 | 1800.0 | 0.000 | 0.000 | 0.000 | 0.0 | 1 | 0.0 | 1.0 | 35.0 | Active | 0 | 0 |
1 | 2582071 | 363914 | -1 | 63975.555 | 45000 | 2250.0 | 2250.0 | 0.0 | 0.0 | 2250.000 | 2250.0 | 2250.0 | 60175.080 | 64875.555 | 64875.555 | 1.0 | 1 | 0.0 | 0.0 | 69.0 | Active | 0 | 0 |
2 | 1740877 | 371185 | -7 | 31815.225 | 450000 | 0.0 | 0.0 | 0.0 | 0.0 | 2250.000 | 2250.0 | 2250.0 | 26926.425 | 31460.085 | 31460.085 | 0.0 | 0 | 0.0 | 0.0 | 30.0 | Active | 0 | 0 |
3 | 1389973 | 337855 | -4 | 236572.110 | 225000 | 2250.0 | 2250.0 | 0.0 | 0.0 | 11795.760 | 11925.0 | 11925.0 | 224949.285 | 233048.970 | 233048.970 | 1.0 | 1 | 0.0 | 0.0 | 10.0 | Active | 0 | 0 |
4 | 1891521 | 126868 | -1 | 453919.455 | 450000 | 0.0 | 11547.0 | 0.0 | 11547.0 | 22924.890 | 27000.0 | 27000.0 | 443044.395 | 453919.455 | 453919.455 | 0.0 | 1 | 0.0 | 1.0 | 101.0 | Active | 0 | 0 |
credit_card_balance.columns.values
array(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'NAME_CONTRACT_STATUS', 'SK_DPD', 'SK_DPD_DEF'], dtype=object)
bureau.head()
SK_ID_CURR | SK_ID_BUREAU | CREDIT_ACTIVE | CREDIT_CURRENCY | DAYS_CREDIT | CREDIT_DAY_OVERDUE | DAYS_CREDIT_ENDDATE | DAYS_ENDDATE_FACT | AMT_CREDIT_MAX_OVERDUE | CNT_CREDIT_PROLONG | AMT_CREDIT_SUM | AMT_CREDIT_SUM_DEBT | AMT_CREDIT_SUM_LIMIT | AMT_CREDIT_SUM_OVERDUE | CREDIT_TYPE | DAYS_CREDIT_UPDATE | AMT_ANNUITY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 215354 | 5714462 | Closed | currency 1 | -497 | 0 | -153.0 | -153.0 | NaN | 0 | 91323.0 | 0.0 | NaN | 0.0 | Consumer credit | -131 | NaN |
1 | 215354 | 5714463 | Active | currency 1 | -208 | 0 | 1075.0 | NaN | NaN | 0 | 225000.0 | 171342.0 | NaN | 0.0 | Credit card | -20 | NaN |
2 | 215354 | 5714464 | Active | currency 1 | -203 | 0 | 528.0 | NaN | NaN | 0 | 464323.5 | NaN | NaN | 0.0 | Consumer credit | -16 | NaN |
3 | 215354 | 5714465 | Active | currency 1 | -203 | 0 | NaN | NaN | NaN | 0 | 90000.0 | NaN | NaN | 0.0 | Credit card | -16 | NaN |
4 | 215354 | 5714466 | Active | currency 1 | -629 | 0 | 1197.0 | NaN | 77674.5 | 0 | 2700000.0 | NaN | NaN | 0.0 | Consumer credit | -21 | NaN |
查看每個資料每個欄位的遺漏值的佔比
# checking missing data
total = application_train.isnull().sum().sort_values(ascending = False)
percent = (application_train.isnull().sum()/application_train.isnull().count()*100).sort_values(ascending = False)
missing_application_train_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_application_train_data.head(20)
Total | Percent | |
---|---|---|
COMMONAREA_MEDI | 214865 | 69.872297 |
COMMONAREA_AVG | 214865 | 69.872297 |
COMMONAREA_MODE | 214865 | 69.872297 |
NONLIVINGAPARTMENTS_MODE | 213514 | 69.432963 |
NONLIVINGAPARTMENTS_MEDI | 213514 | 69.432963 |
NONLIVINGAPARTMENTS_AVG | 213514 | 69.432963 |
FONDKAPREMONT_MODE | 210295 | 68.386172 |
LIVINGAPARTMENTS_MEDI | 210199 | 68.354953 |
LIVINGAPARTMENTS_MODE | 210199 | 68.354953 |
LIVINGAPARTMENTS_AVG | 210199 | 68.354953 |
FLOORSMIN_MEDI | 208642 | 67.848630 |
FLOORSMIN_MODE | 208642 | 67.848630 |
FLOORSMIN_AVG | 208642 | 67.848630 |
YEARS_BUILD_MEDI | 204488 | 66.497784 |
YEARS_BUILD_AVG | 204488 | 66.497784 |
YEARS_BUILD_MODE | 204488 | 66.497784 |
OWN_CAR_AGE | 202929 | 65.990810 |
LANDAREA_MODE | 182590 | 59.376738 |
LANDAREA_AVG | 182590 | 59.376738 |
LANDAREA_MEDI | 182590 | 59.376738 |
# checking missing data
total = POS_CASH_balance.isnull().sum().sort_values(ascending = False)
percent = (POS_CASH_balance.isnull().sum()/POS_CASH_balance.isnull().count()*100).sort_values(ascending = False)
missing_POS_CASH_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_POS_CASH_balance_data.head(3)
Total | Percent | |
---|---|---|
CNT_INSTALMENT_FUTURE | 26087 | 0.260835 |
CNT_INSTALMENT | 26071 | 0.260675 |
SK_DPD_DEF | 0 | 0.000000 |
# checking missing data
total = bureau_balance.isnull().sum().sort_values(ascending = False)
percent = (bureau_balance.isnull().sum()/bureau_balance.isnull().count()*100).sort_values(ascending = False)
missing_bureau_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_bureau_balance_data.head(3)
Total | Percent | |
---|---|---|
STATUS | 0 | 0.0 |
MONTHS_BALANCE | 0 | 0.0 |
SK_ID_BUREAU | 0 | 0.0 |
# checking missing data
total = previous_application.isnull().sum().sort_values(ascending = False)
percent = (previous_application.isnull().sum()/previous_application.isnull().count()*100).sort_values(ascending = False)
missing_previous_application_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_previous_application_data.head(15)
Total | Percent | |
---|---|---|
RATE_INTEREST_PRIVILEGED | 1664263 | 99.643698 |
RATE_INTEREST_PRIMARY | 1664263 | 99.643698 |
RATE_DOWN_PAYMENT | 895844 | 53.636480 |
AMT_DOWN_PAYMENT | 895844 | 53.636480 |
NAME_TYPE_SUITE | 820405 | 49.119754 |
DAYS_TERMINATION | 673065 | 40.298129 |
NFLAG_INSURED_ON_APPROVAL | 673065 | 40.298129 |
DAYS_FIRST_DRAWING | 673065 | 40.298129 |
DAYS_FIRST_DUE | 673065 | 40.298129 |
DAYS_LAST_DUE_1ST_VERSION | 673065 | 40.298129 |
DAYS_LAST_DUE | 673065 | 40.298129 |
AMT_GOODS_PRICE | 385515 | 23.081773 |
AMT_ANNUITY | 372235 | 22.286665 |
CNT_PAYMENT | 372230 | 22.286366 |
PRODUCT_COMBINATION | 346 | 0.020716 |
# checking missing data
total = installments_payments.isnull().sum().sort_values(ascending = False)
percent = (installments_payments.isnull().sum()/installments_payments.isnull().count()*100).sort_values(ascending = False)
missing_installments_payments_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_installments_payments_data.head(3)
Total | Percent | |
---|---|---|
AMT_PAYMENT | 2905 | 0.021352 |
DAYS_ENTRY_PAYMENT | 2905 | 0.021352 |
AMT_INSTALMENT | 0 | 0.000000 |
# checking missing data
total = credit_card_balance.isnull().sum().sort_values(ascending = False)
percent = (credit_card_balance.isnull().sum()/credit_card_balance.isnull().count()*100).sort_values(ascending = False)
missing_credit_card_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_credit_card_balance_data.head(10)
Total | Percent | |
---|---|---|
AMT_PAYMENT_CURRENT | 767988 | 19.998063 |
AMT_DRAWINGS_OTHER_CURRENT | 749816 | 19.524872 |
CNT_DRAWINGS_POS_CURRENT | 749816 | 19.524872 |
CNT_DRAWINGS_OTHER_CURRENT | 749816 | 19.524872 |
CNT_DRAWINGS_ATM_CURRENT | 749816 | 19.524872 |
AMT_DRAWINGS_ATM_CURRENT | 749816 | 19.524872 |
AMT_DRAWINGS_POS_CURRENT | 749816 | 19.524872 |
CNT_INSTALMENT_MATURE_CUM | 305236 | 7.948208 |
AMT_INST_MIN_REGULARITY | 305236 | 7.948208 |
SK_DPD_DEF | 0 | 0.000000 |
# checking missing data
total = bureau.isnull().sum().sort_values(ascending = False)
percent = (bureau.isnull().sum()/bureau.isnull().count()*100).sort_values(ascending = False)
missing_bureau_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_bureau_data.head(8)
Total | Percent | |
---|---|---|
AMT_ANNUITY | 1226791 | 71.473490 |
AMT_CREDIT_MAX_OVERDUE | 1124488 | 65.513264 |
DAYS_ENDDATE_FACT | 633653 | 36.916958 |
AMT_CREDIT_SUM_LIMIT | 591780 | 34.477415 |
AMT_CREDIT_SUM_DEBT | 257669 | 15.011932 |
DAYS_CREDIT_ENDDATE | 105553 | 6.149573 |
AMT_CREDIT_SUM | 13 | 0.000757 |
CREDIT_TYPE | 0 | 0.000000 |
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot(application_train["AMT_CREDIT"])
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_INCOME_TOTAL")
ax = sns.distplot(application_train["AMT_INCOME_TOTAL"].dropna())
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_GOODS_PRICE")
ax = sns.distplot(application_train["AMT_GOODS_PRICE"].dropna())
temp = application_train["NAME_TYPE_SUITE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "Who accompanied client when applying for the application in % ",
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Name of type of the Suite in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='schoolStateNames')
temp = application_train["TARGET"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie', labels='labels', values='values', title='是否償還貸款')
訓練資料非常不平衡。
temp = application_train["NAME_CONTRACT_TYPE"].value_counts()
fig = {
"data": [
{
"values": temp.values,
"labels": temp.index,
"domain": {"x": [0, .48]},
#"name": "Types of Loans",
#"hoverinfo":"label+percent+name",
"hole": .7,
"type": "pie"
},
],
"layout": {
"title":"Types of loan",
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Loan Types",
"x": 0.17,
"y": 0.5
}
]
}
}
iplot(fig, filename='donut')
temp1 = application_train["FLAG_OWN_CAR"].value_counts()
temp2 = application_train["FLAG_OWN_REALTY"].value_counts()
fig = {
"data": [
{
"values": temp1.values,
"labels": temp1.index,
"domain": {"x": [0, .48]},
"name": "是否擁有車",
"hoverinfo":"label+percent+name",
"hole": .6,
"type": "pie"
},
{
"values": temp2.values,
"labels": temp2.index,
"text":"Own Realty",
"textposition":"inside",
"domain": {"x": [.52, 1]},
"name": "是否擁有不動產",
"hoverinfo":"label+percent+name",
"hole": .6,
"type": "pie"
}],
"layout": {
"title":"Purpose of loan",
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "是否擁有車",
"x": 0.20,
"y": 0.5
},
{
"font": {
"size": 20
},
"showarrow": False,
"text": "是否擁有不動產",
"x": 0.8,
"y": 0.5
}
]
}
}
iplot(fig, filename='donut')
temp = application_train["NAME_INCOME_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Income sources of Applicant\'s', hole = 0.5)
temp = application_train["NAME_FAMILY_STATUS"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Family Status of Applicant\'s', hole = 0.5)
temp = application_train["OCCUPATION_TYPE"].value_counts()
temp.iplot(kind='bar', xTitle = 'Occupation', yTitle = "Count", title = 'Occupation of Applicant\'s who applied for loan', color = 'green')
temp = application_train["NAME_EDUCATION_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Education of Applicant\'s', hole = 0.5)
temp = application_train["NAME_HOUSING_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Type of House', hole = 0.5)
temp = application_train["ORGANIZATION_TYPE"].value_counts()
temp.iplot(kind='bar', xTitle = 'Organization Name', yTitle = "Count", title = 'Types of Organizations who applied for loan ', color = 'red')
temp
Working 158774 Commercial associate 71617 Pensioner 55362 State servant 21703 Unemployed 22 Student 18 Businessman 10 Maternity leave 5 Name: NAME_INCOME_TYPE, dtype: int64
temp = application_train["NAME_INCOME_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_INCOME_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_INCOME_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的收入來源比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='收入來源',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_FAMILY_STATUS"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_FAMILY_STATUS"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_FAMILY_STATUS"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的家庭狀況比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='家庭狀況',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["OCCUPATION_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["OCCUPATION_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["OCCUPATION_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的職位比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='職位',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_EDUCATION_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_EDUCATION_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_EDUCATION_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的最高學歷比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='最高學歷',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_HOUSING_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_HOUSING_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_HOUSING_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的居住狀況比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='居住狀況',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["ORGANIZATION_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["ORGANIZATION_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["ORGANIZATION_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的所屬單位比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='所屬單位',
tickfont=dict(
size=10,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_TYPE_SUITE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_TYPE_SUITE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_TYPE_SUITE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='無償還貸款'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='有償還貸款'
)
data = [trace1, trace2]
layout = go.Layout(
title = "是否償還貸款者的陪同者比較(%)",
#barmode='stack',
width = 1000,
xaxis=dict(
title='陪同者',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
plt.figure(figsize = (16, 8))
# KDE plot of loans that were repaid on time
sns.kdeplot(application_train.loc[application_train['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = '有償還貸款')
# KDE plot of loans which were not repaid on time
sns.kdeplot(application_train.loc[application_train['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = '無償還貸款')
# Labeling of plot
plt.xlabel('Age (years)');
plt.ylabel('Density');
plt.title('年齡分佈');
年紀輕傾向無法償還貸款。
temp = previous_application["NAME_CONTRACT_TYPE"].value_counts()
fig = {
"data": [
{
"values": temp.values,
"labels": temp.index,
"domain": {"x": [0, .48]},
#"name": "Types of Loans",
#"hoverinfo":"label+percent+name",
"hole": .7,
"type": "pie"
},
],
"layout": {
"title":"之前申請的貸款合約類型",
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": " 貸款合約類型",
"x": 0.12,
"y": 0.5
}
]
}
}
iplot(fig, filename='donut')
temp = previous_application["WEEKDAY_APPR_PROCESS_START"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "之前申請時為禮拜幾(%)",
xaxis=dict(
title='禮拜幾',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='schoolStateNames')
temp = previous_application["NAME_CASH_LOAN_PURPOSE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Blues',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "之前現金貸款的目的(%) ",
xaxis=dict(
title='現金貸款目的',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_CONTRACT_STATUS"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='申請結果', hole = 0.5)
temp = previous_application["NAME_PAYMENT_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='還款方式', hole = 0.7)
temp = previous_application["CODE_REJECT_REASON"].value_counts()
temp.iplot(kind='bar', xTitle = '被拒絕原因', yTitle = "Count", title = '之前申請被拒絕原因', colors=['#75e575'])
temp = previous_application["NAME_TYPE_SUITE"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = '#ea7c96',
#reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "之前申請時的陪同者(%) ",
xaxis=dict(
title='陪同者',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_CLIENT_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='之前申請時為新或舊客戶', hole = 0.7)
temp = previous_application["NAME_GOODS_CATEGORY"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Greens',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "之前申請抵押品種類(%) ",
xaxis=dict(
title='物品',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_PORTFOLIO"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Was the previous application for CASH, POS, CAR, …', hole = 0.7)
temp = previous_application["NAME_PRODUCT_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='之前申請的是 x-sell 還是 walk-in', hole = 0.7)
temp = previous_application["CHANNEL_TYPE"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
# colorscale = 'Violet',
# reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "之前申請的客戶來源管道(%)",
xaxis=dict(
title='來源管道',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_SELLER_INDUSTRY"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Greens',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "之前申請的主要銷售產業(%)",
xaxis=dict(
title='Industry Name',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_YIELD_GROUP"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Greens',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "之前申請的利率(%) ",
xaxis=dict(
title='利率種類',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='比例(%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["PRODUCT_COMBINATION"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='之前申請的詳細產品組合', hole = 0.7)
temp = previous_application["NFLAG_INSURED_ON_APPROVAL"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='之前申請貸款時是否申請保險(YES : 1, NO : 0)', hole = 0.7)
from sklearn import preprocessing
categorical_feats = [
f for f in application_train.columns if application_train[f].dtype == 'object'
]
for col in categorical_feats:
lb = preprocessing.LabelEncoder()
lb.fit(list(application_train[col].values.astype('str')) + list(application_test[col].values.astype('str')))
application_train[col] = lb.transform(list(application_train[col].values.astype('str')))
application_test[col] = lb.transform(list(application_test[col].values.astype('str')))
application_train.fillna(-999, inplace = True)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=8, min_samples_leaf=4, max_features=0.5, random_state=2018)
rf.fit(application_train.drop(['SK_ID_CURR', 'TARGET'],axis=1), application_train.TARGET)
features = application_train.drop(['SK_ID_CURR', 'TARGET'],axis=1).columns.values
x, y = (list(x) for x in zip(*sorted(zip(rf.feature_importances_, features),
reverse = False)))
trace2 = go.Bar(
x=x ,
y=y,
marker=dict(
color=x,
colorscale = 'Viridis',
reversescale = True
),
name='Random Forest Feature importance',
orientation='h',
)
layout = dict(
title='Barplot of Feature importances',
width = 900, height = 2000,
yaxis=dict(
showgrid=False,
showline=False,
showticklabels=True,
# domain=[0, 0.85],
),
margin=dict(
l=300,
),
)
fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')
來看看最重要的四個 feature 與 target 的相關係數
ext_data = application_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
plt.figure(figsize = (8, 6))
# Heatmap of correlations
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
plt.title('相關係數熱圖');