本文数据来自：

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

warnings.filterwarnings("ignore")

总体了解¶

这一部分主要了解数据集整体情况，包括但不限于：

数据集规模
特征数
有哪些特征
不同特征数据类型
不同特征样本数
不同特征均值、标准差、中位等

In [2]:

data_train = pd.read_csv("data/train.csv")
data_test = pd.read_csv("data/testA.csv")

In [3]:

data_train.shape, data_test.shape

Out[3]:

((800000, 47), (200000, 48))

In [4]:

data_train.columns

Out[4]:

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')

In [5]:

data_test.columns

Out[5]:

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1',
       'n2.2', 'n2.3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12',
       'n13', 'n14'],
      dtype='object')

In [6]:

set(data_test.columns) - set(data_train.columns) 

Out[6]:

{'n2.2', 'n2.3'}

In [7]:

set(data_train.columns) - set(data_test.columns) 

Out[7]:

{'isDefault'}

In [8]:

data_train.dtypes

Out[8]:

id                      int64
loanAmnt              float64
term                    int64
interestRate          float64
installment           float64
grade                  object
subGrade               object
employmentTitle       float64
employmentLength       object
homeOwnership           int64
annualIncome          float64
verificationStatus      int64
issueDate              object
isDefault               int64
purpose                 int64
postCode              float64
regionCode              int64
dti                   float64
delinquency_2years    float64
ficoRangeLow          float64
ficoRangeHigh         float64
openAcc               float64
pubRec                float64
pubRecBankruptcies    float64
revolBal              float64
revolUtil             float64
totalAcc              float64
initialListStatus       int64
applicationType         int64
earliesCreditLine      object
title                 float64
policyCode            float64
n0                    float64
n1                    float64
n2                    float64
n2.1                  float64
n4                    float64
n5                    float64
n6                    float64
n7                    float64
n8                    float64
n9                    float64
n10                   float64
n11                   float64
n12                   float64
n13                   float64
n14                   float64
dtype: object

In [9]:

data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id                    800000 non-null int64
loanAmnt              800000 non-null float64
term                  800000 non-null int64
interestRate          800000 non-null float64
installment           800000 non-null float64
grade                 800000 non-null object
subGrade              800000 non-null object
employmentTitle       799999 non-null float64
employmentLength      753201 non-null object
homeOwnership         800000 non-null int64
annualIncome          800000 non-null float64
verificationStatus    800000 non-null int64
issueDate             800000 non-null object
isDefault             800000 non-null int64
purpose               800000 non-null int64
postCode              799999 non-null float64
regionCode            800000 non-null int64
dti                   799761 non-null float64
delinquency_2years    800000 non-null float64
ficoRangeLow          800000 non-null float64
ficoRangeHigh         800000 non-null float64
openAcc               800000 non-null float64
pubRec                800000 non-null float64
pubRecBankruptcies    799595 non-null float64
revolBal              800000 non-null float64
revolUtil             799469 non-null float64
totalAcc              800000 non-null float64
initialListStatus     800000 non-null int64
applicationType       800000 non-null int64
earliesCreditLine     800000 non-null object
title                 799999 non-null float64
policyCode            800000 non-null float64
n0                    759730 non-null float64
n1                    759730 non-null float64
n2                    759730 non-null float64
n2.1                  759730 non-null float64
n4                    766761 non-null float64
n5                    759730 non-null float64
n6                    759730 non-null float64
n7                    759730 non-null float64
n8                    759729 non-null float64
n9                    759730 non-null float64
n10                   766761 non-null float64
n11                   730248 non-null float64
n12                   759730 non-null float64
n13                   759730 non-null float64
n14                   759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB

In [10]:

data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 48 columns):
id                    200000 non-null int64
loanAmnt              200000 non-null float64
term                  200000 non-null int64
interestRate          200000 non-null float64
installment           200000 non-null float64
grade                 200000 non-null object
subGrade              200000 non-null object
employmentTitle       200000 non-null float64
employmentLength      188258 non-null object
homeOwnership         200000 non-null int64
annualIncome          200000 non-null float64
verificationStatus    200000 non-null int64
issueDate             200000 non-null object
purpose               200000 non-null int64
postCode              200000 non-null float64
regionCode            200000 non-null int64
dti                   199939 non-null float64
delinquency_2years    200000 non-null float64
ficoRangeLow          200000 non-null float64
ficoRangeHigh         200000 non-null float64
openAcc               200000 non-null float64
pubRec                200000 non-null float64
pubRecBankruptcies    199884 non-null float64
revolBal              200000 non-null float64
revolUtil             199873 non-null float64
totalAcc              200000 non-null float64
initialListStatus     200000 non-null int64
applicationType       200000 non-null int64
earliesCreditLine     200000 non-null object
title                 200000 non-null float64
policyCode            200000 non-null float64
n0                    189889 non-null float64
n1                    189889 non-null float64
n2                    189889 non-null float64
n2.1                  189889 non-null float64
n2.2                  189889 non-null float64
n2.3                  189889 non-null float64
n4                    191606 non-null float64
n5                    189889 non-null float64
n6                    189889 non-null float64
n7                    189889 non-null float64
n8                    189889 non-null float64
n9                    189889 non-null float64
n10                   191606 non-null float64
n11                   182425 non-null float64
n12                   189889 non-null float64
n13                   189889 non-null float64
n14                   189889 non-null float64
dtypes: float64(35), int64(8), object(5)
memory usage: 73.2+ MB

In [11]:

data_train.describe().T

Out[11]:

	count	mean	std	min	25%	50%	75%	max
id	800000.0	399999.500000	230940.252013	0.00	199999.75	399999.500	599999.25	799999.00
loanAmnt	800000.0	14416.818875	8716.086178	500.00	8000.00	12000.000	20000.00	40000.00
term	800000.0	3.482745	0.855832	3.00	3.00	3.000	3.00	5.00
interestRate	800000.0	13.238391	4.765757	5.31	9.75	12.740	15.99	30.99
installment	800000.0	437.947723	261.460393	15.69	248.45	375.135	580.71	1715.42
employmentTitle	799999.0	72005.351714	106585.640204	0.00	427.00	7755.000	117663.50	378351.00
homeOwnership	800000.0	0.614213	0.675749	0.00	0.00	1.000	1.00	5.00
annualIncome	800000.0	76133.910493	68947.513672	0.00	45600.00	65000.000	90000.00	10999200.00
verificationStatus	800000.0	1.009683	0.782716	0.00	0.00	1.000	2.00	2.00
isDefault	800000.0	0.199513	0.399634	0.00	0.00	0.000	0.00	1.00
purpose	800000.0	1.745982	2.367453	0.00	0.00	0.000	4.00	13.00
postCode	799999.0	258.535648	200.037446	0.00	103.00	203.000	395.00	940.00
regionCode	800000.0	16.385758	11.036679	0.00	8.00	14.000	22.00	50.00
dti	799761.0	18.284557	11.150155	-1.00	11.79	17.610	24.06	999.00
delinquency_2years	800000.0	0.318239	0.880325	0.00	0.00	0.000	0.00	39.00
ficoRangeLow	800000.0	696.204081	31.865995	630.00	670.00	690.000	710.00	845.00
ficoRangeHigh	800000.0	700.204226	31.866674	634.00	674.00	694.000	714.00	850.00
openAcc	800000.0	11.598020	5.475286	0.00	8.00	11.000	14.00	86.00
pubRec	800000.0	0.214915	0.606467	0.00	0.00	0.000	0.00	86.00
pubRecBankruptcies	799595.0	0.134163	0.377471	0.00	0.00	0.000	0.00	12.00
revolBal	800000.0	16228.706505	22458.020544	0.00	5944.00	11132.000	19734.00	2904836.00
revolUtil	799469.0	51.790734	24.516126	0.00	33.40	52.100	70.70	892.30
totalAcc	800000.0	24.998861	11.999201	2.00	16.00	23.000	32.00	162.00
initialListStatus	800000.0	0.416953	0.493055	0.00	0.00	0.000	1.00	1.00
applicationType	800000.0	0.019267	0.137464	0.00	0.00	0.000	0.00	1.00
title	799999.0	1754.113589	7941.474040	0.00	0.00	1.000	5.00	61680.00
policyCode	800000.0	1.000000	0.000000	1.00	1.00	1.000	1.00	1.00
n0	759730.0	0.511932	1.333266	0.00	0.00	0.000	0.00	51.00
n1	759730.0	3.642330	2.246825	0.00	2.00	3.000	5.00	33.00
n2	759730.0	5.642648	3.302810	0.00	3.00	5.000	7.00	63.00
n2.1	759730.0	5.642648	3.302810	0.00	3.00	5.000	7.00	63.00
n4	766761.0	4.735641	2.949969	0.00	3.00	4.000	6.00	49.00
n5	759730.0	8.107937	4.799210	0.00	5.00	7.000	11.00	70.00
n6	759730.0	8.575994	7.400536	0.00	4.00	7.000	11.00	132.00
n7	759730.0	8.282953	4.561689	0.00	5.00	7.000	10.00	79.00
n8	759729.0	14.622488	8.124610	1.00	9.00	13.000	19.00	128.00
n9	759730.0	5.592345	3.216184	0.00	3.00	5.000	7.00	45.00
n10	766761.0	11.643896	5.484104	0.00	8.00	11.000	14.00	82.00
n11	730248.0	0.000815	0.030075	0.00	0.00	0.000	0.00	4.00
n12	759730.0	0.003384	0.062041	0.00	0.00	0.000	0.00	4.00
n13	759730.0	0.089366	0.509069	0.00	0.00	0.000	0.00	39.00
n14	759730.0	2.178606	1.844377	0.00	1.00	2.000	3.00	30.00

In [12]:

data_train.head().append(data_train.tail()).T

Out[12]:

	0	1	2	3	4	799995	799996	799997	799998	799999
id	0	1	2	3	4	799995	799996	799997	799998	799999
loanAmnt	35000	18000	12000	11000	3000	25000	17000	6000	19200	9000
term	5	5	5	3	3	3	3	3	3	3
interestRate	19.52	18.49	16.99	7.26	12.99	14.49	7.9	13.33	6.92	11.06
installment	917.97	461.9	298.17	340.96	101.07	860.41	531.94	203.12	592.14	294.91
grade	E	D	D	A	C	C	A	C	A	B
subGrade	E2	D2	D3	A4	C2	C4	A4	C3	A4	B3
employmentTitle	320	219843	31698	46854	54	2659	29205	2582	151	13
employmentLength	2 years	5 years	8 years	10+ years	NaN	7 years	10+ years	10+ years	10+ years	5 years
homeOwnership	2	0	0	1	1	1	0	1	0	0
annualIncome	110000	46000	74000	118000	29000	72000	99000	65000	96000	120000
verificationStatus	2	2	2	1	2	0	2	2	2	0
issueDate	2014-07-01	2012-08-01	2015-10-01	2015-08-01	2016-03-01	2016-07-01	2013-04-01	2015-10-01	2015-02-01	2018-08-01
isDefault	1	0	0	0	0	0	0	1	0	0
purpose	1	0	0	4	10	0	4	0	4	4
postCode	137	156	337	148	301	242	563	47	34	62
regionCode	32	18	14	11	21	8	10	17	18	13
dti	17.05	27.83	22.77	17.21	32.16	19.03	15.72	12.11	29.25	8.99
delinquency_2years	0	0	0	0	0	0	1	1	0	0
ficoRangeLow	730	700	675	685	690	710	720	670	675	695
ficoRangeHigh	734	704	679	689	694	714	724	674	679	699
openAcc	7	13	11	9	12	14	7	5	16	7
pubRec	0	0	0	0	0	0	0	0	0	0
pubRecBankruptcies	0	0	0	0	0	0	0	0	0	0
revolBal	24178	15096	4606	9948	2942	9933	20472	6381	69702	8420
revolUtil	48.9	38.9	51.8	52.6	32	46.4	98.4	51.9	61.3	72.6
totalAcc	27	18	27	28	27	15	42	36	37	13
initialListStatus	0	1	0	1	0	1	0	1	1	0
applicationType	0	0	0	0	0	0	0	0	0	0
earliesCreditLine	Aug-2001	May-2002	May-2006	May-1999	Aug-1977	Aug-2011	May-1989	Jul-2002	Jan-1994	Feb-2002
title	1	1723	0	4	11	0	33369	0	4	4
policyCode	1	1	1	1	1	1	1	1	1	1
n0	0	NaN	0	6	1	0	0	2	0	2
n1	2	NaN	0	4	2	5	2	1	5	2
n2	2	NaN	3	6	7	10	2	4	8	3
n2.1	2	NaN	3	6	7	10	2	4	8	3
n4	4	10	0	4	2	6	2	1	7	2
n5	9	NaN	0	16	4	6	15	4	10	3
n6	8	NaN	21	4	9	2	16	26	6	4
n7	4	NaN	4	7	10	12	2	4	12	4
n8	12	NaN	5	21	15	13	19	10	22	8
n9	2	NaN	3	6	7	10	2	4	8	3
n10	7	13	11	9	12	14	7	5	16	7
n11	0	NaN	0	0	0	0	0	0	0	0
n12	0	NaN	0	0	0	0	0	0	0	0
n13	0	NaN	0	0	0	0	0	1	0	0
n14	2	NaN	4	1	4	3	0	4	5	2

缺失值和唯一值¶

主要探索：

如何处理特征的缺失值？
如何处理唯一值的特征？

缺失值¶

In [13]:

# 判断有多少个缺失特征
n_missing = data_train.isna().any().sum()
# 类似：data_train.isnull().any().sum()
n_missing

Out[13]:

In [14]:

def feature_null_more_than_ratio(data: pd.DataFrame, ratio: float) -> dict:
    have_null_fea_dict = (data.isnull().sum()/len(data)).to_dict()
    res = {}
    for key,val in have_null_fea_dict.items():
        if val > ratio:
            res[key] = val
    return res

In [15]:

# 缺失值超过一定比例的特征
for ratio in np.linspace(0.1, 1, 10):
    dct = feature_null_more_than_ratio(data_train, ratio)
    print("features null ratio bigger than {}: {}".format(ratio, dct))

features null ratio bigger than 0.1: {}
features null ratio bigger than 0.2: {}
features null ratio bigger than 0.30000000000000004: {}
features null ratio bigger than 0.4: {}
features null ratio bigger than 0.5: {}
features null ratio bigger than 0.6: {}
features null ratio bigger than 0.7000000000000001: {}
features null ratio bigger than 0.8: {}
features null ratio bigger than 0.9: {}
features null ratio bigger than 1.0: {}

In [16]:

# 不同特征缺失值比例
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 4))

missing = data_train.isnull().sum()/len(data_train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar(ax = ax);

In [278]:

# 有缺失值的特征数据分布
fig, axes = plt.subplots(nrows=n_missing, ncols=1, figsize=(12, 3*n_missing))
for i, (key, val) in enumerate(missing.to_dict().items()):
    val = int(val * len(data_train))
    axes[i].hist(data_train[key].dropna());
    axes[i].set_title(key +"/" + str(val), loc='right', pad=1.0);

对于缺失值的处理：

employmentTitle，postCode，title：众数
employmentLength：转换后平均或分桶后取样本数最多的桶
dti，revolUtil：：加权平均
pubRecBankruptcies，n14，n13，n12，n11，n10，n9，n8，n7，n6，n5，n4，n2，n1，n0：加权平均后取整

In [96]:

fea = "employmentLength"
data_train[fea].dropna().value_counts()

Out[96]:

10+ years    262753
2 years       72358
< 1 year      64237
3 years       64152
1 year        52489
5 years       50102
4 years       47985
6 years       37254
8 years       36192
7 years       35407
9 years       30272
Name: employmentLength, dtype: int64

唯一值¶

对唯一值直接去掉：

policyCode

In [97]:

[col for col in data_train.columns if data_train[col].nunique() <= 1]

Out[97]:

['policyCode']

In [100]:

[col for col in data_test.columns if data_test[col].nunique() <= 1]

Out[100]:

['policyCode']

数据类型¶

数据特征类型一般包括以下几类：

数值数据：定量数据
- 离散数据
- 连续数据
分类数据：定性数据

度量数据有以下不同方法：

标签
序数
区间
比率

数值数据一般可以直接用于模型，但风控时往往要对其进行分桶，对其 WOE 编码后再使用，这里提到过，这其实是一种非线性的思路，能够降低模型复杂度，减少噪声影响，使得模型更加稳定。

主要探索：

数据分布如何？
如何转换数据？

In [558]:

# unique
for fea in data_train.columns:
    print("Feature:", fea, "\t",
          "Number Unique:", len(data_train[fea].unique()), "\t",
          "Sample:", data_train[fea].iloc[0])

Feature: id 	 Number Unique: 800000 	 Sample: 0
Feature: loanAmnt 	 Number Unique: 1540 	 Sample: 35000.0
Feature: term 	 Number Unique: 2 	 Sample: 5
Feature: interestRate 	 Number Unique: 641 	 Sample: 19.52
Feature: installment 	 Number Unique: 72360 	 Sample: 917.97
Feature: grade 	 Number Unique: 7 	 Sample: E
Feature: subGrade 	 Number Unique: 35 	 Sample: E2
Feature: employmentTitle 	 Number Unique: 248684 	 Sample: 320.0
Feature: employmentLength 	 Number Unique: 12 	 Sample: 2 years
Feature: homeOwnership 	 Number Unique: 6 	 Sample: 2
Feature: annualIncome 	 Number Unique: 44926 	 Sample: 110000.0
Feature: verificationStatus 	 Number Unique: 3 	 Sample: 2
Feature: issueDate 	 Number Unique: 139 	 Sample: 2014-07-01
Feature: isDefault 	 Number Unique: 2 	 Sample: 1
Feature: purpose 	 Number Unique: 14 	 Sample: 1
Feature: postCode 	 Number Unique: 933 	 Sample: 137.0
Feature: regionCode 	 Number Unique: 51 	 Sample: 32
Feature: dti 	 Number Unique: 6322 	 Sample: 17.05
Feature: delinquency_2years 	 Number Unique: 30 	 Sample: 0.0
Feature: ficoRangeLow 	 Number Unique: 39 	 Sample: 730.0
Feature: ficoRangeHigh 	 Number Unique: 39 	 Sample: 734.0
Feature: openAcc 	 Number Unique: 75 	 Sample: 7.0
Feature: pubRec 	 Number Unique: 32 	 Sample: 0.0
Feature: pubRecBankruptcies 	 Number Unique: 12 	 Sample: 0.0
Feature: revolBal 	 Number Unique: 71116 	 Sample: 24178.0
Feature: revolUtil 	 Number Unique: 1287 	 Sample: 48.9
Feature: totalAcc 	 Number Unique: 134 	 Sample: 27.0
Feature: initialListStatus 	 Number Unique: 2 	 Sample: 0
Feature: applicationType 	 Number Unique: 2 	 Sample: 0
Feature: earliesCreditLine 	 Number Unique: 720 	 Sample: Aug-2001
Feature: title 	 Number Unique: 39645 	 Sample: 1.0
Feature: policyCode 	 Number Unique: 1 	 Sample: 1.0
Feature: n0 	 Number Unique: 40 	 Sample: 0.0
Feature: n1 	 Number Unique: 34 	 Sample: 2.0
Feature: n2 	 Number Unique: 51 	 Sample: 2.0
Feature: n2.1 	 Number Unique: 51 	 Sample: 2.0
Feature: n4 	 Number Unique: 47 	 Sample: 4.0
Feature: n5 	 Number Unique: 66 	 Sample: 9.0
Feature: n6 	 Number Unique: 108 	 Sample: 8.0
Feature: n7 	 Number Unique: 71 	 Sample: 4.0
Feature: n8 	 Number Unique: 103 	 Sample: 12.0
Feature: n9 	 Number Unique: 45 	 Sample: 2.0
Feature: n10 	 Number Unique: 77 	 Sample: 7.0
Feature: n11 	 Number Unique: 6 	 Sample: 0.0
Feature: n12 	 Number Unique: 6 	 Sample: 0.0
Feature: n13 	 Number Unique: 29 	 Sample: 0.0
Feature: n14 	 Number Unique: 32 	 Sample: 2.0

In [281]:

# 不同数据类型
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
print("Numerical Features: ", len(numerical_fea))
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
print("Category Features: ", len(category_fea))
for fea in category_fea:
    print("Feature: ", fea, "\t",
          "Number Unique:", len(data_train[fea].unique()), "\t",
          "Sample: ", data_train[fea].iloc[0])

Numerical Features:  42
Category Features:  5
Feature:  grade 	 Number Unique: 7 	 Sample:  E
Feature:  subGrade 	 Number Unique: 35 	 Sample:  E2
Feature:  employmentLength 	 Number Unique: 12 	 Sample:  2 years
Feature:  issueDate 	 Number Unique: 139 	 Sample:  2014-07-01
Feature:  earliesCreditLine 	 Number Unique: 720 	 Sample:  Aug-2001

In [763]:

def is_float(x):
    return bool(int(x) - x)

def all_unique_is_float(arr):
    for i in arr:
        if is_float(i):
            return True
    return False

In [775]:

# 离散/连续数据类型
# numerical_serial_fea = [fea for fea in numerical_fea if data_train[fea].nunique() > 10]
numerical_serial_fea = [fea for fea in numerical_fea if 
                        all_unique_is_float(data_train[fea].dropna().unique())
                       or data_train[fea].nunique() > 50]
numerical_discrate_fea = [fea for fea in numerical_fea if fea not in numerical_serial_fea]

In [776]:

numerical_discrate_fea

Out[776]:

['term',
 'homeOwnership',
 'verificationStatus',
 'isDefault',
 'purpose',
 'delinquency_2years',
 'ficoRangeLow',
 'ficoRangeHigh',
 'pubRec',
 'pubRecBankruptcies',
 'initialListStatus',
 'applicationType',
 'policyCode',
 'n0',
 'n1',
 'n2',
 'n2.1',
 'n4',
 'n9',
 'n11',
 'n12',
 'n13',
 'n14']

In [777]:

numerical_serial_fea

Out[777]:

['id',
 'loanAmnt',
 'interestRate',
 'installment',
 'employmentTitle',
 'annualIncome',
 'postCode',
 'regionCode',
 'dti',
 'openAcc',
 'revolBal',
 'revolUtil',
 'totalAcc',
 'title',
 'n5',
 'n6',
 'n7',
 'n8',
 'n10']

In [656]:

# value counts
vc_features = features #category_fea + numerical_discrate_fea
rows = len(vc_features)
fig, axes = plt.subplots(nrows=rows, ncols=1, figsize=(12, 4*rows))
for i, fea in enumerate(vc_features):
#     sns.barplot(
#         x=list(data_train[fea].value_counts())[:10], 
#         y=data_train[fea].value_counts().keys()[:10],
#         ax = axes[i]
#     )
    data_train[fea].value_counts().nlargest(10).plot(
        kind='barh', ax=axes[i])
    axes[i].set_title(fea)

In [371]:

# 看分布（离散/连续数据类型）进一步验证
features = category_fea + numerical_discrate_fea + numerical_serial_fea
df = data_train[features]

In [559]:

figsize = (18, 40)
cols = 4
rows = len(features) // cols + 1

def trim_axs(axs, N):
    axs = axs.flat
    for ax in axs[N:]:
        ax.remove()
    return axs[:N]

In [569]:

def plot_hist(fea_data, ax):
    if fea_data.dtype == "object":
        ax.hist(fea_data, bins=15)
    else:
        try:
            sns.distplot(fea_data, ax=ax, axlabel=False)
        except Exception as e:
            ax.hist(fea_data, bins=15)

In [570]:

axs = plt.figure(figsize=figsize, constrained_layout=False).subplots(rows, cols)
axs = trim_axs(axs, len(features))

for ax, fea in zip(axs, features):
    fea_data = df[fea].dropna()
    ax.set_title(fea)
    plot_hist(fea_data, ax)

类别数据：

grade 贷款等级
subGrade 贷款等级之子级
employmentLength 就业年限（年）
issueDate 贷款发放的月份
earliesCreditLine 借款人最早报告的信用额度开立的月份

数值数据：

id
loanAmnt 贷款金额
interestRate 贷款利率
installment 分期付款金额
employmentTitle 就业职称
annualIncome 年收入
postCode 借款人在贷款申请时的贷款用途类别
regionCode 地区编码
purpose 借款人在贷款申请时的贷款用途类别
dti 债务收入比
delinquency_2years 借款人过去2年信用档案中逾期30天以上的违约事件数
ficoRangeLow 借款人在贷款发放时的fico所属的下限范围
ficoRangeHigh 借款人在贷款发放时的fico所属的上限范围
openAcc 借款人信用档案中未结信用额度的数量
pubRec 贬损公共记录的数量
pubRecBankruptcies 公开记录清除的数量
revolBal 信贷周转余额合计
revolUtil 循环额度利用率，或借款人使用的相对于所有可用循环信贷的信贷金额
totalAcc 借款人信用档案中当前的信用额度总数
title 借款人提供的贷款名称
n 系列：n0 n1 n2 n2.1 n4 n5 n6 n7 n8 n9 n10 n13 n14
term 贷款期限
homeOwnership 借款人在登记时提供的房屋所有权状况
verificationStatus 验证状态
initialListStatus 贷款的初始列表状态
applicationType 表明贷款是个人申请还是与两个共同借款人的联合申请
n 系列：n11 n12

In [435]:

set(data_test["purpose"].dropna().to_list())

Out[435]:

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}

In [466]:

# 偏态
skews = data_train[numerical_serial_fea].skew()
skews

Out[466]:

id                    1.586291e-17
loanAmnt              7.826221e-01
interestRate          7.123646e-01
installment           1.006328e+00
employmentTitle       1.379124e+00
annualIncome          4.618773e+01
purpose               1.251126e+00
postCode              8.321358e-01
regionCode            7.824390e-01
dti                   2.730132e+01
delinquency_2years    5.685041e+00
ficoRangeLow          1.282404e+00
ficoRangeHigh         1.282608e+00
openAcc               1.298413e+00
pubRec                1.352173e+01
pubRecBankruptcies    3.440526e+00
revolBal              1.592468e+01
revolUtil            -1.805942e-02
totalAcc              9.560535e-01
title                 5.225696e+00
n0                    5.106909e+00
n1                    1.400085e+00
n2                    1.542892e+00
n2.1                  1.542892e+00
n4                    1.701304e+00
n5                    1.339371e+00
n6                    2.063377e+00
n7                    1.471781e+00
n8                    1.303497e+00
n9                    1.432905e+00
n10                   1.296988e+00
n13                   1.363116e+01
n14                   1.486073e+00
dtype: float64

In [526]:

def addone(x):
    return x + 1e-5

In [539]:

new_data

Out[539]:

0         2.836151
1         3.326115
2         3.125444
3         2.845491
4         3.470724
            ...   
799995    2.946017
799996    2.754934
799997    2.494032
799998    3.375880
799999    2.196114
Name: dti, Length: 799759, dtype: float64

In [546]:

# 非正态的尝试 log 化
# sns.FacetGrid(df, size=2).map(sns.distplot, fea, ax=ax).add_legend()
skew_dict = skews.to_dict()
fig, axes = plt.subplots(nrows=len(skew_dict), ncols=2, figsize=(12, 3*len(skew_dict)))

for i, (key, val) in enumerate(skew_dict.items()):
    fea_data = df[key].dropna()
    plot_hist(fea_data, ax=axes[i, 0])
    axes[i, 0].set_title(key +"/" + str(val), loc='right', pad=1.0)
    
    new_data = fea_data.dropna().apply(addone).apply(np.log).dropna()
    plot_hist(new_data, ax=axes[i, 1])
    new_val = new_data.skew()
    axes[i, 1].set_title(key +"/" + str(new_val), loc='right', pad=1.0)

In [573]:

# 不同特征值在不同 label 上的分布
fig, axes = plt.subplots(nrows=len(features), ncols=2, figsize=(12, 3*len(features)))

for i, fea in enumerate(features):
    fea_data = data_train[data_train.isDefault == 1][fea].dropna()
    plot_hist(fea_data, ax=axes[i, 0])
    axes[i, 0].set_title(fea + "  Label=1", loc='right', pad=1.0)
    
    fea_data = data_train[data_train.isDefault == 0][fea].dropna()
    plot_hist(fea_data, ax=axes[i, 1])
    axes[i, 1].set_title(fea + "  Label=0", loc='right', pad=1.0)

In [400]:

# category 编码
import category_encoders as ce

fea = "grade"

encoder1 = ce.OneHotEncoder(cols=[fea])
encoder2 = ce.OrdinalEncoder(cols=[fea])
encoder3 = ce.TargetEncoder(cols=[fea])
encoder4 = ce.CatBoostEncoder(cols=[fea])

tmp = data_train

X = tmp[fea]
y = tmp.isDefault

encoder3.fit(X, y)
encoder3.transform(X)

数据间关系¶

主要探索：

特征之间的关系？
数据和因变量之间的关系？

In [780]:

rows = len(features)
need_drop_feas = ["policyCode", "isDefault"]
fig, axes = plt.subplots(nrows=rows, ncols=1, figsize=(12, 4*rows))
g = sns.FacetGrid(data_train, hue="isDefault")
for i, fea in enumerate(features):
    ax.set_title(fea)
    if fea in category_fea + numerical_discrate_fea:
        sns.countplot(fea, hue="isDefault", data=data_train, ax=axes[i])
    else:
        g = g.map(sns.distplot, fea, hist=False, rug=False, ax=axes[i])

In [793]:

df = data_train.drop(columns=['policyCode'])
df = df.dropna()

In [794]:

grade_dct = dict(zip(['A', 'B', 'C', 'D', 'E', 'F', 'G'], range(10, 80, 10)))
def deal_grade(x):
    return grade_dct.get(x)
df["grade"] = df["grade"].apply(deal_grade)

In [795]:

def deal_sub_grade(x):
    return grade_dct.get(x[0]) + int(x[1])
df["subGrade"] = df["subGrade"].apply(deal_sub_grade)

In [796]:

def deal_employmentLength(x):
    if x == "10+ years":
        return 10.0
    elif x == "< 1 year":
        return 0.5
    else:
        return float(x[0])
df["employmentLength"] = df["employmentLength"].apply(deal_employmentLength)

In [798]:

def deal_date(x):
    return int(x.split("-")[1])
df["issueDate"] = df["issueDate"].apply(deal_date)
df["earliesCreditLine"] = df["earliesCreditLine"].apply(deal_date)

In [801]:

df.shape

Out[801]:

(686195, 46)

In [841]:

sns.pairplot(df)

Out[841]:

<seaborn.axisgrid.PairGrid at 0x3032761d0>

In [802]:

correlation = df.corr(method='pearson')
correlation

Out[802]:

	id	loanAmnt	term	interestRate	installment	grade	subGrade	employmentTitle	employmentLength	homeOwnership	...	n5	n6	n7	n8	n9	n10	n11	n12	n13	n14
id	1.000000	-0.000137	-0.000473	0.000891	0.000064	0.000726	0.000829	-0.000321	0.000131	0.000480	...	0.001224	-0.000005	-0.000774	0.000325	-0.001075	-0.000716	-0.001491	0.001352	-0.000549	0.000910
loanAmnt	-0.000137	1.000000	0.379981	0.138414	0.952859	0.138764	0.141862	-0.013124	0.084307	-0.135043	...	0.199807	0.068186	0.161663	0.172961	0.144367	0.169741	-0.000195	0.001058	-0.019254	-0.022880
term	-0.000473	0.379981	1.000000	0.424135	0.139269	0.432181	0.439224	-0.002889	0.053551	-0.081535	...	0.050838	0.068340	0.049326	0.057561	0.051015	0.067234	0.000078	-0.000664	-0.009914	0.025177
interestRate	0.000891	0.138414	0.424135	1.000000	0.149650	0.955392	0.973036	0.088540	-0.007121	0.054506	...	-0.078071	0.009437	-0.002851	-0.050779	0.093593	-0.006641	0.004572	0.015520	0.026478	0.207397
installment	0.000064	0.952859	0.139269	0.149650	1.000000	0.140537	0.143903	-0.003201	0.072138	-0.109258	...	0.186735	0.052563	0.157911	0.160584	0.151995	0.159538	0.000639	0.003581	-0.013776	-0.002539
grade	0.000726	0.138764	0.432181	0.955392	0.140537	1.000000	0.993899	0.025339	-0.007262	0.055954	...	-0.081755	0.014100	-0.000357	-0.048262	0.094926	-0.001089	0.005209	0.018756	0.027621	0.213758
subGrade	0.000829	0.141862	0.439224	0.973036	0.143903	0.993899	1.000000	0.025738	-0.007805	0.057834	...	-0.084117	0.013915	-0.000640	-0.049757	0.097255	-0.001628	0.005506	0.019117	0.028449	0.218171
employmentTitle	-0.000321	-0.013124	-0.002889	0.088540	-0.003201	0.025339	0.025738	1.000000	0.004445	-0.012711	...	0.035327	-0.040999	-0.010159	0.003841	0.009572	-0.034370	-0.002532	-0.008490	-0.010400	-0.041281
employmentLength	0.000131	0.084307	0.053551	-0.007121	0.072138	-0.007262	-0.007805	0.004445	1.000000	-0.126093	...	0.112959	-0.050802	0.111677	0.145870	0.124596	0.035308	0.003859	0.007286	-0.004126	0.022276
homeOwnership	0.000480	-0.135043	-0.081535	0.054506	-0.109258	0.055954	0.057834	-0.012711	-0.126093	1.000000	...	-0.057957	-0.059186	-0.036628	-0.072769	-0.042051	-0.086234	-0.000435	-0.010579	0.003331	-0.029507
annualIncome	0.001259	0.301299	0.043118	-0.078382	0.292231	-0.076288	-0.078431	-0.025936	0.063834	-0.107037	...	0.126979	0.092375	0.074633	0.102526	0.064542	0.121144	0.006112	0.011483	0.006949	0.050277
verificationStatus	0.001157	0.230375	0.126160	0.262485	0.235710	0.249371	0.258806	0.050524	0.020118	-0.003181	...	0.026460	0.013775	0.022258	0.021981	0.049278	0.017713	0.016004	0.006458	0.012015	0.116825
issueDate	-0.000229	-0.023626	0.006581	0.011330	-0.025476	0.006067	0.006602	0.015922	-0.005216	0.000923	...	-0.012169	-0.000439	-0.012611	-0.010878	-0.018482	-0.010512	-0.004951	-0.003090	0.000448	-0.002945
isDefault	-0.000356	0.069370	0.182203	0.263130	0.054570	0.267691	0.272425	-0.016846	-0.018737	0.054330	...	-0.017145	0.010387	0.032932	-0.005063	0.068545	0.029741	-0.000579	0.003322	0.009915	0.086587
purpose	-0.000579	-0.147691	-0.089671	-0.055638	-0.142893	-0.054401	-0.056373	-0.005748	-0.033322	0.052736	...	-0.025463	-0.020507	-0.041203	-0.042298	-0.054592	-0.040127	-0.000217	-0.000382	0.002994	-0.027021
postCode	0.001272	-0.024268	0.012460	0.007534	-0.028208	0.008459	0.008483	-0.003466	0.038401	-0.037671	...	-0.033607	0.055521	-0.029423	-0.015324	-0.028994	-0.018950	-0.002364	0.001450	-0.007048	0.001597
regionCode	-0.000717	-0.011491	0.005560	0.001958	-0.013323	0.002936	0.002923	-0.006566	0.002374	-0.023216	...	-0.005430	0.028212	-0.013225	-0.008540	-0.014741	-0.003899	0.001146	0.001680	-0.002171	0.004583
dti	-0.000958	0.025903	0.071209	0.167799	0.028820	0.170502	0.174621	-0.026998	0.019488	0.001245	...	0.061254	0.211291	0.159712	0.118111	0.225537	0.259652	-0.003910	0.006727	-0.016499	0.081727
delinquency_2years	0.001043	-0.005619	-0.010545	0.045488	0.004096	0.049305	0.050603	-0.023783	0.018421	-0.029148	...	0.026756	0.087296	0.003992	0.070817	-0.013445	0.047102	0.043372	0.105131	0.646466	-0.033470
ficoRangeLow	-0.000322	0.112148	-0.004106	-0.392944	0.063314	-0.391667	-0.403055	-0.007319	0.024920	-0.067967	...	0.058472	-0.014259	0.011768	0.024845	-0.192841	0.028583	-0.017020	-0.031263	-0.098212	-0.086812
ficoRangeHigh	-0.000322	0.112148	-0.004106	-0.392941	0.063314	-0.391663	-0.403052	-0.007320	0.024921	-0.067968	...	0.058472	-0.014261	0.011767	0.024845	-0.192842	0.028582	-0.017020	-0.031262	-0.098211	-0.086814
openAcc	-0.000701	0.168866	0.067096	-0.006357	0.158727	-0.000702	-0.001240	-0.034619	0.035837	-0.086642	...	0.509541	0.399433	0.823184	0.646144	0.652567	0.999338	0.004172	0.015473	0.014369	0.363653
pubRec	0.000460	-0.060644	-0.015219	0.049681	-0.050015	0.056102	0.057032	-0.039684	0.027941	0.004852	...	-0.010928	-0.009888	0.004558	0.011628	-0.006377	-0.020936	0.001511	-0.005606	-0.013455	0.079578
pubRecBankruptcies	-0.000370	-0.087840	-0.004207	0.056363	-0.081118	0.058168	0.059287	-0.021444	0.024096	0.005230	...	0.005001	-0.001176	0.011661	0.045122	-0.017972	-0.021149	-0.002089	-0.012704	-0.026798	0.092174
revolBal	-0.000099	0.308759	0.071543	-0.026235	0.296700	-0.026196	-0.026220	-0.005905	0.081371	-0.101426	...	0.219975	0.012560	0.231218	0.212808	0.291450	0.214786	-0.003688	0.002164	-0.034631	-0.025807
revolUtil	0.000617	0.103068	0.064150	0.235426	0.120643	0.230288	0.237172	0.060622	0.034967	-0.042104	...	-0.126573	0.013895	-0.205260	-0.176209	0.117551	-0.145124	-0.012407	-0.020569	-0.010275	-0.221510
totalAcc	0.000222	0.199569	0.097962	-0.042996	0.175993	-0.037956	-0.039559	-0.021599	0.097790	-0.148723	...	0.616773	0.688800	0.552832	0.755643	0.388082	0.701334	0.009725	0.020205	0.067643	0.324512
initialListStatus	0.002100	-0.045649	-0.102937	0.139105	0.003158	0.122242	0.119932	0.169288	-0.002766	0.011639	...	0.014952	-0.027163	-0.010947	0.001960	0.025139	-0.024308	0.000560	0.002122	0.003855	0.002873
applicationType	0.000606	0.076632	0.042733	0.040592	0.071352	0.023375	0.024380	-0.018300	-0.040297	-0.046848	...	-0.040502	0.006833	-0.017652	-0.028198	-0.021668	-0.008161	-0.002029	-0.002805	-0.003878	0.000283
earliesCreditLine	-0.000937	-0.172411	-0.045828	0.113254	-0.151469	0.110683	0.113915	-0.041527	-0.218207	0.114683	...	-0.280704	-0.030076	-0.175065	-0.312749	-0.158493	-0.138735	-0.011656	-0.023992	-0.032932	0.023733
title	-0.001360	-0.011387	-0.012212	0.040267	-0.004221	-0.001433	-0.001830	0.322269	0.008424	-0.013575	...	0.035220	-0.028375	-0.012372	0.004019	0.001557	-0.027083	-0.001929	-0.003555	-0.009220	-0.040198
n0	0.000738	-0.051882	-0.017905	0.045730	-0.042105	0.050597	0.051864	-0.035091	0.015334	0.000439	...	0.068146	0.126665	-0.020640	0.083279	-0.021110	0.022838	0.035636	-0.003754	0.338738	0.087534
n1	-0.000969	0.186119	0.042089	0.028119	0.189667	0.025104	0.025982	0.009910	0.074616	-0.024495	...	0.574675	-0.031059	0.648263	0.439577	0.807070	0.532144	-0.004205	-0.005825	-0.039817	0.148759
n2	-0.001247	0.144167	0.051242	0.094440	0.151837	0.095391	0.097727	0.010425	0.124433	-0.042266	...	0.473335	0.003064	0.791554	0.563769	0.998711	0.652167	-0.002335	0.002004	-0.030003	0.235176
n2.1	-0.001247	0.144167	0.051242	0.094440	0.151837	0.095391	0.097727	0.010425	0.124433	-0.042266	...	0.473335	0.003064	0.791554	0.563769	0.998711	0.652167	-0.002335	0.002004	-0.030003	0.235176
n4	-0.000460	0.212253	0.045370	-0.056254	0.204154	-0.058516	-0.060263	-0.002499	0.062268	-0.024311	...	0.718815	-0.015563	0.775826	0.565750	0.654655	0.639183	-0.000703	-0.004735	-0.040247	0.230801
n5	0.001224	0.199807	0.050838	-0.078071	0.186735	-0.081755	-0.084117	0.035327	0.112959	-0.057957	...	1.000000	0.035081	0.623620	0.837493	0.473218	0.506993	0.007998	0.010157	0.007804	0.216891
n6	-0.000005	0.068186	0.068340	0.009437	0.052563	0.014100	0.013915	-0.040999	-0.050802	-0.059186	...	0.035081	1.000000	0.026419	0.077353	0.002960	0.398685	0.000387	0.006351	0.076549	0.192587
n7	-0.000774	0.161663	0.049326	-0.002851	0.157911	-0.000357	-0.000640	-0.010159	0.111677	-0.036628	...	0.623620	0.026419	1.000000	0.781213	0.792696	0.822737	0.003294	0.010245	-0.026240	0.343771
n8	0.000325	0.172961	0.057561	-0.050779	0.160584	-0.048262	-0.049757	0.003841	0.145870	-0.072769	...	0.837493	0.077353	0.781213	1.000000	0.564787	0.643180	0.010553	0.018527	0.019065	0.291139
n9	-0.001075	0.144367	0.051015	0.093593	0.151995	0.094926	0.097255	0.009572	0.124596	-0.042051	...	0.473218	0.002960	0.792696	0.564787	1.000000	0.653162	-0.002341	0.002088	-0.029725	0.235176
n10	-0.000716	0.169741	0.067234	-0.006641	0.159538	-0.001089	-0.001628	-0.034370	0.035308	-0.086234	...	0.506993	0.398685	0.822737	0.643180	0.653162	1.000000	0.001252	0.005800	0.013583	0.363144
n11	-0.001491	-0.000195	0.000078	0.004572	0.000639	0.005209	0.005506	-0.002532	0.003859	-0.000435	...	0.007998	0.000387	0.003294	0.010553	-0.002341	0.001252	1.000000	0.001564	0.071764	-0.005072
n12	0.001352	0.001058	-0.000664	0.015520	0.003581	0.018756	0.019117	-0.008490	0.007286	-0.010579	...	0.010157	0.006351	0.010245	0.018527	0.002088	0.005800	0.001564	1.000000	0.005785	-0.006825
n13	-0.000549	-0.019254	-0.009914	0.026478	-0.013776	0.027621	0.028449	-0.010400	-0.004126	0.003331	...	0.007804	0.076549	-0.026240	0.019065	-0.029725	0.013583	0.071764	0.005785	1.000000	-0.014610
n14	0.000910	-0.022880	0.025177	0.207397	-0.002539	0.213758	0.218171	-0.041281	0.022276	-0.029507	...	0.216891	0.192587	0.343771	0.291139	0.235176	0.363144	-0.005072	-0.006825	-0.014610	1.000000

46 rows × 46 columns

In [834]:

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 15))

sns.heatmap(correlation,
           ax=ax,
           fmt='.1f',
           xticklabels=correlation.columns,
           yticklabels=correlation.columns)

Out[834]:

<matplotlib.axes._subplots.AxesSubplot at 0x2646a7d90>

In [840]:

fig, (ax) = plt.subplots(1, 1, figsize=(18, 18))

hm = sns.heatmap(correlation, 
                 ax=ax,           
                 cmap="bwr", 
                 annot=True, 
                 fmt='.1f',       
                 linewidths=.05)

fig.subplots_adjust(top=0.94)
fig.suptitle('Attributes and their Correlation Heatmap', 
              fontsize=14, 
              fontweight='bold');

In [ ]: