In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

warnings.filterwarnings("ignore")

总体了解

这一部分主要了解数据集整体情况,包括但不限于:

  • 数据集规模
  • 特征数
  • 有哪些特征
  • 不同特征数据类型
  • 不同特征样本数
  • 不同特征均值、标准差、中位等
In [2]:
data_train = pd.read_csv("data/train.csv")
data_test = pd.read_csv("data/testA.csv")
In [3]:
data_train.shape, data_test.shape
Out[3]:
((800000, 47), (200000, 48))
In [4]:
data_train.columns
Out[4]:
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')
In [5]:
data_test.columns
Out[5]:
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1',
       'n2.2', 'n2.3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12',
       'n13', 'n14'],
      dtype='object')
In [6]:
set(data_test.columns) - set(data_train.columns) 
Out[6]:
{'n2.2', 'n2.3'}
In [7]:
set(data_train.columns) - set(data_test.columns) 
Out[7]:
{'isDefault'}
In [8]:
data_train.dtypes
Out[8]:
id                      int64
loanAmnt              float64
term                    int64
interestRate          float64
installment           float64
grade                  object
subGrade               object
employmentTitle       float64
employmentLength       object
homeOwnership           int64
annualIncome          float64
verificationStatus      int64
issueDate              object
isDefault               int64
purpose                 int64
postCode              float64
regionCode              int64
dti                   float64
delinquency_2years    float64
ficoRangeLow          float64
ficoRangeHigh         float64
openAcc               float64
pubRec                float64
pubRecBankruptcies    float64
revolBal              float64
revolUtil             float64
totalAcc              float64
initialListStatus       int64
applicationType         int64
earliesCreditLine      object
title                 float64
policyCode            float64
n0                    float64
n1                    float64
n2                    float64
n2.1                  float64
n4                    float64
n5                    float64
n6                    float64
n7                    float64
n8                    float64
n9                    float64
n10                   float64
n11                   float64
n12                   float64
n13                   float64
n14                   float64
dtype: object
In [9]:
data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id                    800000 non-null int64
loanAmnt              800000 non-null float64
term                  800000 non-null int64
interestRate          800000 non-null float64
installment           800000 non-null float64
grade                 800000 non-null object
subGrade              800000 non-null object
employmentTitle       799999 non-null float64
employmentLength      753201 non-null object
homeOwnership         800000 non-null int64
annualIncome          800000 non-null float64
verificationStatus    800000 non-null int64
issueDate             800000 non-null object
isDefault             800000 non-null int64
purpose               800000 non-null int64
postCode              799999 non-null float64
regionCode            800000 non-null int64
dti                   799761 non-null float64
delinquency_2years    800000 non-null float64
ficoRangeLow          800000 non-null float64
ficoRangeHigh         800000 non-null float64
openAcc               800000 non-null float64
pubRec                800000 non-null float64
pubRecBankruptcies    799595 non-null float64
revolBal              800000 non-null float64
revolUtil             799469 non-null float64
totalAcc              800000 non-null float64
initialListStatus     800000 non-null int64
applicationType       800000 non-null int64
earliesCreditLine     800000 non-null object
title                 799999 non-null float64
policyCode            800000 non-null float64
n0                    759730 non-null float64
n1                    759730 non-null float64
n2                    759730 non-null float64
n2.1                  759730 non-null float64
n4                    766761 non-null float64
n5                    759730 non-null float64
n6                    759730 non-null float64
n7                    759730 non-null float64
n8                    759729 non-null float64
n9                    759730 non-null float64
n10                   766761 non-null float64
n11                   730248 non-null float64
n12                   759730 non-null float64
n13                   759730 non-null float64
n14                   759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
In [10]:
data_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 48 columns):
id                    200000 non-null int64
loanAmnt              200000 non-null float64
term                  200000 non-null int64
interestRate          200000 non-null float64
installment           200000 non-null float64
grade                 200000 non-null object
subGrade              200000 non-null object
employmentTitle       200000 non-null float64
employmentLength      188258 non-null object
homeOwnership         200000 non-null int64
annualIncome          200000 non-null float64
verificationStatus    200000 non-null int64
issueDate             200000 non-null object
purpose               200000 non-null int64
postCode              200000 non-null float64
regionCode            200000 non-null int64
dti                   199939 non-null float64
delinquency_2years    200000 non-null float64
ficoRangeLow          200000 non-null float64
ficoRangeHigh         200000 non-null float64
openAcc               200000 non-null float64
pubRec                200000 non-null float64
pubRecBankruptcies    199884 non-null float64
revolBal              200000 non-null float64
revolUtil             199873 non-null float64
totalAcc              200000 non-null float64
initialListStatus     200000 non-null int64
applicationType       200000 non-null int64
earliesCreditLine     200000 non-null object
title                 200000 non-null float64
policyCode            200000 non-null float64
n0                    189889 non-null float64
n1                    189889 non-null float64
n2                    189889 non-null float64
n2.1                  189889 non-null float64
n2.2                  189889 non-null float64
n2.3                  189889 non-null float64
n4                    191606 non-null float64
n5                    189889 non-null float64
n6                    189889 non-null float64
n7                    189889 non-null float64
n8                    189889 non-null float64
n9                    189889 non-null float64
n10                   191606 non-null float64
n11                   182425 non-null float64
n12                   189889 non-null float64
n13                   189889 non-null float64
n14                   189889 non-null float64
dtypes: float64(35), int64(8), object(5)
memory usage: 73.2+ MB
In [11]:
data_train.describe().T
Out[11]:
count mean std min 25% 50% 75% max
id 800000.0 399999.500000 230940.252013 0.00 199999.75 399999.500 599999.25 799999.00
loanAmnt 800000.0 14416.818875 8716.086178 500.00 8000.00 12000.000 20000.00 40000.00
term 800000.0 3.482745 0.855832 3.00 3.00 3.000 3.00 5.00
interestRate 800000.0 13.238391 4.765757 5.31 9.75 12.740 15.99 30.99
installment 800000.0 437.947723 261.460393 15.69 248.45 375.135 580.71 1715.42
employmentTitle 799999.0 72005.351714 106585.640204 0.00 427.00 7755.000 117663.50 378351.00
homeOwnership 800000.0 0.614213 0.675749 0.00 0.00 1.000 1.00 5.00
annualIncome 800000.0 76133.910493 68947.513672 0.00 45600.00 65000.000 90000.00 10999200.00
verificationStatus 800000.0 1.009683 0.782716 0.00 0.00 1.000 2.00 2.00
isDefault 800000.0 0.199513 0.399634 0.00 0.00 0.000 0.00 1.00
purpose 800000.0 1.745982 2.367453 0.00 0.00 0.000 4.00 13.00
postCode 799999.0 258.535648 200.037446 0.00 103.00 203.000 395.00 940.00
regionCode 800000.0 16.385758 11.036679 0.00 8.00 14.000 22.00 50.00
dti 799761.0 18.284557 11.150155 -1.00 11.79 17.610 24.06 999.00
delinquency_2years 800000.0 0.318239 0.880325 0.00 0.00 0.000 0.00 39.00
ficoRangeLow 800000.0 696.204081 31.865995 630.00 670.00 690.000 710.00 845.00
ficoRangeHigh 800000.0 700.204226 31.866674 634.00 674.00 694.000 714.00 850.00
openAcc 800000.0 11.598020 5.475286 0.00 8.00 11.000 14.00 86.00
pubRec 800000.0 0.214915 0.606467 0.00 0.00 0.000 0.00 86.00
pubRecBankruptcies 799595.0 0.134163 0.377471 0.00 0.00 0.000 0.00 12.00
revolBal 800000.0 16228.706505 22458.020544 0.00 5944.00 11132.000 19734.00 2904836.00
revolUtil 799469.0 51.790734 24.516126 0.00 33.40 52.100 70.70 892.30
totalAcc 800000.0 24.998861 11.999201 2.00 16.00 23.000 32.00 162.00
initialListStatus 800000.0 0.416953 0.493055 0.00 0.00 0.000 1.00 1.00
applicationType 800000.0 0.019267 0.137464 0.00 0.00 0.000 0.00 1.00
title 799999.0 1754.113589 7941.474040 0.00 0.00 1.000 5.00 61680.00
policyCode 800000.0 1.000000 0.000000 1.00 1.00 1.000 1.00 1.00
n0 759730.0 0.511932 1.333266 0.00 0.00 0.000 0.00 51.00
n1 759730.0 3.642330 2.246825 0.00 2.00 3.000 5.00 33.00
n2 759730.0 5.642648 3.302810 0.00 3.00 5.000 7.00 63.00
n2.1 759730.0 5.642648 3.302810 0.00 3.00 5.000 7.00 63.00
n4 766761.0 4.735641 2.949969 0.00 3.00 4.000 6.00 49.00
n5 759730.0 8.107937 4.799210 0.00 5.00 7.000 11.00 70.00
n6 759730.0 8.575994 7.400536 0.00 4.00 7.000 11.00 132.00
n7 759730.0 8.282953 4.561689 0.00 5.00 7.000 10.00 79.00
n8 759729.0 14.622488 8.124610 1.00 9.00 13.000 19.00 128.00
n9 759730.0 5.592345 3.216184 0.00 3.00 5.000 7.00 45.00
n10 766761.0 11.643896 5.484104 0.00 8.00 11.000 14.00 82.00
n11 730248.0 0.000815 0.030075 0.00 0.00 0.000 0.00 4.00
n12 759730.0 0.003384 0.062041 0.00 0.00 0.000 0.00 4.00
n13 759730.0 0.089366 0.509069 0.00 0.00 0.000 0.00 39.00
n14 759730.0 2.178606 1.844377 0.00 1.00 2.000 3.00 30.00
In [12]:
data_train.head().append(data_train.tail()).T
Out[12]:
0 1 2 3 4 799995 799996 799997 799998 799999
id 0 1 2 3 4 799995 799996 799997 799998 799999
loanAmnt 35000 18000 12000 11000 3000 25000 17000 6000 19200 9000
term 5 5 5 3 3 3 3 3 3 3
interestRate 19.52 18.49 16.99 7.26 12.99 14.49 7.9 13.33 6.92 11.06
installment 917.97 461.9 298.17 340.96 101.07 860.41 531.94 203.12 592.14 294.91
grade E D D A C C A C A B
subGrade E2 D2 D3 A4 C2 C4 A4 C3 A4 B3
employmentTitle 320 219843 31698 46854 54 2659 29205 2582 151 13
employmentLength 2 years 5 years 8 years 10+ years NaN 7 years 10+ years 10+ years 10+ years 5 years
homeOwnership 2 0 0 1 1 1 0 1 0 0
annualIncome 110000 46000 74000 118000 29000 72000 99000 65000 96000 120000
verificationStatus 2 2 2 1 2 0 2 2 2 0
issueDate 2014-07-01 2012-08-01 2015-10-01 2015-08-01 2016-03-01 2016-07-01 2013-04-01 2015-10-01 2015-02-01 2018-08-01
isDefault 1 0 0 0 0 0 0 1 0 0
purpose 1 0 0 4 10 0 4 0 4 4
postCode 137 156 337 148 301 242 563 47 34 62
regionCode 32 18 14 11 21 8 10 17 18 13
dti 17.05 27.83 22.77 17.21 32.16 19.03 15.72 12.11 29.25 8.99
delinquency_2years 0 0 0 0 0 0 1 1 0 0
ficoRangeLow 730 700 675 685 690 710 720 670 675 695
ficoRangeHigh 734 704 679 689 694 714 724 674 679 699
openAcc 7 13 11 9 12 14 7 5 16 7
pubRec 0 0 0 0 0 0 0 0 0 0
pubRecBankruptcies 0 0 0 0 0 0 0 0 0 0
revolBal 24178 15096 4606 9948 2942 9933 20472 6381 69702 8420
revolUtil 48.9 38.9 51.8 52.6 32 46.4 98.4 51.9 61.3 72.6
totalAcc 27 18 27 28 27 15 42 36 37 13
initialListStatus 0 1 0 1 0 1 0 1 1 0
applicationType 0 0 0 0 0 0 0 0 0 0
earliesCreditLine Aug-2001 May-2002 May-2006 May-1999 Aug-1977 Aug-2011 May-1989 Jul-2002 Jan-1994 Feb-2002
title 1 1723 0 4 11 0 33369 0 4 4
policyCode 1 1 1 1 1 1 1 1 1 1
n0 0 NaN 0 6 1 0 0 2 0 2
n1 2 NaN 0 4 2 5 2 1 5 2
n2 2 NaN 3 6 7 10 2 4 8 3
n2.1 2 NaN 3 6 7 10 2 4 8 3
n4 4 10 0 4 2 6 2 1 7 2
n5 9 NaN 0 16 4 6 15 4 10 3
n6 8 NaN 21 4 9 2 16 26 6 4
n7 4 NaN 4 7 10 12 2 4 12 4
n8 12 NaN 5 21 15 13 19 10 22 8
n9 2 NaN 3 6 7 10 2 4 8 3
n10 7 13 11 9 12 14 7 5 16 7
n11 0 NaN 0 0 0 0 0 0 0 0
n12 0 NaN 0 0 0 0 0 0 0 0
n13 0 NaN 0 0 0 0 0 1 0 0
n14 2 NaN 4 1 4 3 0 4 5 2

缺失值和唯一值

主要探索:

  • 如何处理特征的缺失值?
  • 如何处理唯一值的特征?

缺失值

In [13]:
# 判断有多少个缺失特征
n_missing = data_train.isna().any().sum()
# 类似:data_train.isnull().any().sum()
n_missing
Out[13]:
22
In [14]:
def feature_null_more_than_ratio(data: pd.DataFrame, ratio: float) -> dict:
    have_null_fea_dict = (data.isnull().sum()/len(data)).to_dict()
    res = {}
    for key,val in have_null_fea_dict.items():
        if val > ratio:
            res[key] = val
    return res
In [15]:
# 缺失值超过一定比例的特征
for ratio in np.linspace(0.1, 1, 10):
    dct = feature_null_more_than_ratio(data_train, ratio)
    print("features null ratio bigger than {}: {}".format(ratio, dct))
features null ratio bigger than 0.1: {}
features null ratio bigger than 0.2: {}
features null ratio bigger than 0.30000000000000004: {}
features null ratio bigger than 0.4: {}
features null ratio bigger than 0.5: {}
features null ratio bigger than 0.6: {}
features null ratio bigger than 0.7000000000000001: {}
features null ratio bigger than 0.8: {}
features null ratio bigger than 0.9: {}
features null ratio bigger than 1.0: {}
In [16]:
# 不同特征缺失值比例
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 4))

missing = data_train.isnull().sum()/len(data_train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar(ax = ax);
In [278]:
# 有缺失值的特征数据分布
fig, axes = plt.subplots(nrows=n_missing, ncols=1, figsize=(12, 3*n_missing))
for i, (key, val) in enumerate(missing.to_dict().items()):
    val = int(val * len(data_train))
    axes[i].hist(data_train[key].dropna());
    axes[i].set_title(key +"/" + str(val), loc='right', pad=1.0);

对于缺失值的处理:

  • employmentTitle,postCode,title:众数
  • employmentLength:转换后平均或分桶后取样本数最多的桶
  • dti,revolUtil::加权平均
  • pubRecBankruptcies,n14,n13,n12,n11,n10,n9,n8,n7,n6,n5,n4,n2,n1,n0:加权平均后取整
In [96]:
fea = "employmentLength"
data_train[fea].dropna().value_counts()
Out[96]:
10+ years    262753
2 years       72358
< 1 year      64237
3 years       64152
1 year        52489
5 years       50102
4 years       47985
6 years       37254
8 years       36192
7 years       35407
9 years       30272
Name: employmentLength, dtype: int64

唯一值

对唯一值直接去掉:

  • policyCode
In [97]:
[col for col in data_train.columns if data_train[col].nunique() <= 1]
Out[97]:
['policyCode']
In [100]:
[col for col in data_test.columns if data_test[col].nunique() <= 1]
Out[100]:
['policyCode']

数据类型

数据特征类型一般包括以下几类:

  • 数值数据:定量数据
    • 离散数据
    • 连续数据
  • 分类数据:定性数据

度量数据有以下不同方法:

  • 标签
  • 序数
  • 区间
  • 比率

数值数据一般可以直接用于模型,但风控时往往要对其进行分桶,对其 WOE 编码后再使用,这里提到过,这其实是一种非线性的思路,能够降低模型复杂度,减少噪声影响,使得模型更加稳定。

主要探索:

  • 数据分布如何?
  • 如何转换数据?
In [558]:
# unique
for fea in data_train.columns:
    print("Feature:", fea, "\t",
          "Number Unique:", len(data_train[fea].unique()), "\t",
          "Sample:", data_train[fea].iloc[0])
Feature: id 	 Number Unique: 800000 	 Sample: 0
Feature: loanAmnt 	 Number Unique: 1540 	 Sample: 35000.0
Feature: term 	 Number Unique: 2 	 Sample: 5
Feature: interestRate 	 Number Unique: 641 	 Sample: 19.52
Feature: installment 	 Number Unique: 72360 	 Sample: 917.97
Feature: grade 	 Number Unique: 7 	 Sample: E
Feature: subGrade 	 Number Unique: 35 	 Sample: E2
Feature: employmentTitle 	 Number Unique: 248684 	 Sample: 320.0
Feature: employmentLength 	 Number Unique: 12 	 Sample: 2 years
Feature: homeOwnership 	 Number Unique: 6 	 Sample: 2
Feature: annualIncome 	 Number Unique: 44926 	 Sample: 110000.0
Feature: verificationStatus 	 Number Unique: 3 	 Sample: 2
Feature: issueDate 	 Number Unique: 139 	 Sample: 2014-07-01
Feature: isDefault 	 Number Unique: 2 	 Sample: 1
Feature: purpose 	 Number Unique: 14 	 Sample: 1
Feature: postCode 	 Number Unique: 933 	 Sample: 137.0
Feature: regionCode 	 Number Unique: 51 	 Sample: 32
Feature: dti 	 Number Unique: 6322 	 Sample: 17.05
Feature: delinquency_2years 	 Number Unique: 30 	 Sample: 0.0
Feature: ficoRangeLow 	 Number Unique: 39 	 Sample: 730.0
Feature: ficoRangeHigh 	 Number Unique: 39 	 Sample: 734.0
Feature: openAcc 	 Number Unique: 75 	 Sample: 7.0
Feature: pubRec 	 Number Unique: 32 	 Sample: 0.0
Feature: pubRecBankruptcies 	 Number Unique: 12 	 Sample: 0.0
Feature: revolBal 	 Number Unique: 71116 	 Sample: 24178.0
Feature: revolUtil 	 Number Unique: 1287 	 Sample: 48.9
Feature: totalAcc 	 Number Unique: 134 	 Sample: 27.0
Feature: initialListStatus 	 Number Unique: 2 	 Sample: 0
Feature: applicationType 	 Number Unique: 2 	 Sample: 0
Feature: earliesCreditLine 	 Number Unique: 720 	 Sample: Aug-2001
Feature: title 	 Number Unique: 39645 	 Sample: 1.0
Feature: policyCode 	 Number Unique: 1 	 Sample: 1.0
Feature: n0 	 Number Unique: 40 	 Sample: 0.0
Feature: n1 	 Number Unique: 34 	 Sample: 2.0
Feature: n2 	 Number Unique: 51 	 Sample: 2.0
Feature: n2.1 	 Number Unique: 51 	 Sample: 2.0
Feature: n4 	 Number Unique: 47 	 Sample: 4.0
Feature: n5 	 Number Unique: 66 	 Sample: 9.0
Feature: n6 	 Number Unique: 108 	 Sample: 8.0
Feature: n7 	 Number Unique: 71 	 Sample: 4.0
Feature: n8 	 Number Unique: 103 	 Sample: 12.0
Feature: n9 	 Number Unique: 45 	 Sample: 2.0
Feature: n10 	 Number Unique: 77 	 Sample: 7.0
Feature: n11 	 Number Unique: 6 	 Sample: 0.0
Feature: n12 	 Number Unique: 6 	 Sample: 0.0
Feature: n13 	 Number Unique: 29 	 Sample: 0.0
Feature: n14 	 Number Unique: 32 	 Sample: 2.0
In [281]:
# 不同数据类型
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
print("Numerical Features: ", len(numerical_fea))
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
print("Category Features: ", len(category_fea))
for fea in category_fea:
    print("Feature: ", fea, "\t",
          "Number Unique:", len(data_train[fea].unique()), "\t",
          "Sample: ", data_train[fea].iloc[0])
Numerical Features:  42
Category Features:  5
Feature:  grade 	 Number Unique: 7 	 Sample:  E
Feature:  subGrade 	 Number Unique: 35 	 Sample:  E2
Feature:  employmentLength 	 Number Unique: 12 	 Sample:  2 years
Feature:  issueDate 	 Number Unique: 139 	 Sample:  2014-07-01
Feature:  earliesCreditLine 	 Number Unique: 720 	 Sample:  Aug-2001
In [763]:
def is_float(x):
    return bool(int(x) - x)

def all_unique_is_float(arr):
    for i in arr:
        if is_float(i):
            return True
    return False
In [775]:
# 离散/连续数据类型
# numerical_serial_fea = [fea for fea in numerical_fea if data_train[fea].nunique() > 10]
numerical_serial_fea = [fea for fea in numerical_fea if 
                        all_unique_is_float(data_train[fea].dropna().unique())
                       or data_train[fea].nunique() > 50]
numerical_discrate_fea = [fea for fea in numerical_fea if fea not in numerical_serial_fea]
In [776]:
numerical_discrate_fea
Out[776]:
['term',
 'homeOwnership',
 'verificationStatus',
 'isDefault',
 'purpose',
 'delinquency_2years',
 'ficoRangeLow',
 'ficoRangeHigh',
 'pubRec',
 'pubRecBankruptcies',
 'initialListStatus',
 'applicationType',
 'policyCode',
 'n0',
 'n1',
 'n2',
 'n2.1',
 'n4',
 'n9',
 'n11',
 'n12',
 'n13',
 'n14']
In [777]:
numerical_serial_fea
Out[777]:
['id',
 'loanAmnt',
 'interestRate',
 'installment',
 'employmentTitle',
 'annualIncome',
 'postCode',
 'regionCode',
 'dti',
 'openAcc',
 'revolBal',
 'revolUtil',
 'totalAcc',
 'title',
 'n5',
 'n6',
 'n7',
 'n8',
 'n10']
In [656]:
# value counts
vc_features = features #category_fea + numerical_discrate_fea
rows = len(vc_features)
fig, axes = plt.subplots(nrows=rows, ncols=1, figsize=(12, 4*rows))
for i, fea in enumerate(vc_features):
#     sns.barplot(
#         x=list(data_train[fea].value_counts())[:10], 
#         y=data_train[fea].value_counts().keys()[:10],
#         ax = axes[i]
#     )
    data_train[fea].value_counts().nlargest(10).plot(
        kind='barh', ax=axes[i])
    axes[i].set_title(fea)
In [371]:
# 看分布(离散/连续数据类型)进一步验证
features = category_fea + numerical_discrate_fea + numerical_serial_fea
df = data_train[features]
In [559]:
figsize = (18, 40)
cols = 4
rows = len(features) // cols + 1

def trim_axs(axs, N):
    axs = axs.flat
    for ax in axs[N:]:
        ax.remove()
    return axs[:N]
In [569]:
def plot_hist(fea_data, ax):
    if fea_data.dtype == "object":
        ax.hist(fea_data, bins=15)
    else:
        try:
            sns.distplot(fea_data, ax=ax, axlabel=False)
        except Exception as e:
            ax.hist(fea_data, bins=15)
In [570]:
axs = plt.figure(figsize=figsize, constrained_layout=False).subplots(rows, cols)
axs = trim_axs(axs, len(features))

for ax, fea in zip(axs, features):
    fea_data = df[fea].dropna()
    ax.set_title(fea)
    plot_hist(fea_data, ax)

类别数据:

  • grade 贷款等级
  • subGrade 贷款等级之子级
  • employmentLength 就业年限(年)
  • issueDate 贷款发放的月份
  • earliesCreditLine 借款人最早报告的信用额度开立的月份

数值数据:

  • id
  • loanAmnt 贷款金额
  • interestRate 贷款利率
  • installment 分期付款金额
  • employmentTitle 就业职称
  • annualIncome 年收入
  • postCode 借款人在贷款申请时的贷款用途类别
  • regionCode 地区编码
  • purpose 借款人在贷款申请时的贷款用途类别
  • dti 债务收入比
  • delinquency_2years 借款人过去2年信用档案中逾期30天以上的违约事件数
  • ficoRangeLow 借款人在贷款发放时的fico所属的下限范围
  • ficoRangeHigh 借款人在贷款发放时的fico所属的上限范围
  • openAcc 借款人信用档案中未结信用额度的数量
  • pubRec 贬损公共记录的数量
  • pubRecBankruptcies 公开记录清除的数量
  • revolBal 信贷周转余额合计
  • revolUtil 循环额度利用率,或借款人使用的相对于所有可用循环信贷的信贷金额
  • totalAcc 借款人信用档案中当前的信用额度总数
  • title 借款人提供的贷款名称
  • n 系列:n0 n1 n2 n2.1 n4 n5 n6 n7 n8 n9 n10 n13 n14
  • term 贷款期限
  • homeOwnership 借款人在登记时提供的房屋所有权状况
  • verificationStatus 验证状态
  • initialListStatus 贷款的初始列表状态
  • applicationType 表明贷款是个人申请还是与两个共同借款人的联合申请
  • n 系列:n11 n12
In [435]:
set(data_test["purpose"].dropna().to_list())
Out[435]:
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
In [466]:
# 偏态
skews = data_train[numerical_serial_fea].skew()
skews
Out[466]:
id                    1.586291e-17
loanAmnt              7.826221e-01
interestRate          7.123646e-01
installment           1.006328e+00
employmentTitle       1.379124e+00
annualIncome          4.618773e+01
purpose               1.251126e+00
postCode              8.321358e-01
regionCode            7.824390e-01
dti                   2.730132e+01
delinquency_2years    5.685041e+00
ficoRangeLow          1.282404e+00
ficoRangeHigh         1.282608e+00
openAcc               1.298413e+00
pubRec                1.352173e+01
pubRecBankruptcies    3.440526e+00
revolBal              1.592468e+01
revolUtil            -1.805942e-02
totalAcc              9.560535e-01
title                 5.225696e+00
n0                    5.106909e+00
n1                    1.400085e+00
n2                    1.542892e+00
n2.1                  1.542892e+00
n4                    1.701304e+00
n5                    1.339371e+00
n6                    2.063377e+00
n7                    1.471781e+00
n8                    1.303497e+00
n9                    1.432905e+00
n10                   1.296988e+00
n13                   1.363116e+01
n14                   1.486073e+00
dtype: float64
In [526]:
def addone(x):
    return x + 1e-5
In [539]:
new_data
Out[539]:
0         2.836151
1         3.326115
2         3.125444
3         2.845491
4         3.470724
            ...   
799995    2.946017
799996    2.754934
799997    2.494032
799998    3.375880
799999    2.196114
Name: dti, Length: 799759, dtype: float64
In [546]:
# 非正态的尝试 log 化
# sns.FacetGrid(df, size=2).map(sns.distplot, fea, ax=ax).add_legend()
skew_dict = skews.to_dict()
fig, axes = plt.subplots(nrows=len(skew_dict), ncols=2, figsize=(12, 3*len(skew_dict)))

for i, (key, val) in enumerate(skew_dict.items()):
    fea_data = df[key].dropna()
    plot_hist(fea_data, ax=axes[i, 0])
    axes[i, 0].set_title(key +"/" + str(val), loc='right', pad=1.0)
    
    new_data = fea_data.dropna().apply(addone).apply(np.log).dropna()
    plot_hist(new_data, ax=axes[i, 1])
    new_val = new_data.skew()
    axes[i, 1].set_title(key +"/" + str(new_val), loc='right', pad=1.0)