本文数据来自:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings("ignore")
data_train = pd.read_csv("data/train.csv")
data_test = pd.read_csv("data/testA.csv")
data_train.shape, data_test.shape
((800000, 47), (200000, 48))
data_train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade', 'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership', 'annualIncome', 'verificationStatus', 'issueDate', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'], dtype='object')
data_test.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade', 'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership', 'annualIncome', 'verificationStatus', 'issueDate', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n2.2', 'n2.3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'], dtype='object')
set(data_test.columns) - set(data_train.columns)
{'n2.2', 'n2.3'}
set(data_train.columns) - set(data_test.columns)
{'isDefault'}
data_train.dtypes
id int64 loanAmnt float64 term int64 interestRate float64 installment float64 grade object subGrade object employmentTitle float64 employmentLength object homeOwnership int64 annualIncome float64 verificationStatus int64 issueDate object isDefault int64 purpose int64 postCode float64 regionCode int64 dti float64 delinquency_2years float64 ficoRangeLow float64 ficoRangeHigh float64 openAcc float64 pubRec float64 pubRecBankruptcies float64 revolBal float64 revolUtil float64 totalAcc float64 initialListStatus int64 applicationType int64 earliesCreditLine object title float64 policyCode float64 n0 float64 n1 float64 n2 float64 n2.1 float64 n4 float64 n5 float64 n6 float64 n7 float64 n8 float64 n9 float64 n10 float64 n11 float64 n12 float64 n13 float64 n14 float64 dtype: object
data_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 800000 entries, 0 to 799999 Data columns (total 47 columns): id 800000 non-null int64 loanAmnt 800000 non-null float64 term 800000 non-null int64 interestRate 800000 non-null float64 installment 800000 non-null float64 grade 800000 non-null object subGrade 800000 non-null object employmentTitle 799999 non-null float64 employmentLength 753201 non-null object homeOwnership 800000 non-null int64 annualIncome 800000 non-null float64 verificationStatus 800000 non-null int64 issueDate 800000 non-null object isDefault 800000 non-null int64 purpose 800000 non-null int64 postCode 799999 non-null float64 regionCode 800000 non-null int64 dti 799761 non-null float64 delinquency_2years 800000 non-null float64 ficoRangeLow 800000 non-null float64 ficoRangeHigh 800000 non-null float64 openAcc 800000 non-null float64 pubRec 800000 non-null float64 pubRecBankruptcies 799595 non-null float64 revolBal 800000 non-null float64 revolUtil 799469 non-null float64 totalAcc 800000 non-null float64 initialListStatus 800000 non-null int64 applicationType 800000 non-null int64 earliesCreditLine 800000 non-null object title 799999 non-null float64 policyCode 800000 non-null float64 n0 759730 non-null float64 n1 759730 non-null float64 n2 759730 non-null float64 n2.1 759730 non-null float64 n4 766761 non-null float64 n5 759730 non-null float64 n6 759730 non-null float64 n7 759730 non-null float64 n8 759729 non-null float64 n9 759730 non-null float64 n10 766761 non-null float64 n11 730248 non-null float64 n12 759730 non-null float64 n13 759730 non-null float64 n14 759730 non-null float64 dtypes: float64(33), int64(9), object(5) memory usage: 286.9+ MB
data_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200000 entries, 0 to 199999 Data columns (total 48 columns): id 200000 non-null int64 loanAmnt 200000 non-null float64 term 200000 non-null int64 interestRate 200000 non-null float64 installment 200000 non-null float64 grade 200000 non-null object subGrade 200000 non-null object employmentTitle 200000 non-null float64 employmentLength 188258 non-null object homeOwnership 200000 non-null int64 annualIncome 200000 non-null float64 verificationStatus 200000 non-null int64 issueDate 200000 non-null object purpose 200000 non-null int64 postCode 200000 non-null float64 regionCode 200000 non-null int64 dti 199939 non-null float64 delinquency_2years 200000 non-null float64 ficoRangeLow 200000 non-null float64 ficoRangeHigh 200000 non-null float64 openAcc 200000 non-null float64 pubRec 200000 non-null float64 pubRecBankruptcies 199884 non-null float64 revolBal 200000 non-null float64 revolUtil 199873 non-null float64 totalAcc 200000 non-null float64 initialListStatus 200000 non-null int64 applicationType 200000 non-null int64 earliesCreditLine 200000 non-null object title 200000 non-null float64 policyCode 200000 non-null float64 n0 189889 non-null float64 n1 189889 non-null float64 n2 189889 non-null float64 n2.1 189889 non-null float64 n2.2 189889 non-null float64 n2.3 189889 non-null float64 n4 191606 non-null float64 n5 189889 non-null float64 n6 189889 non-null float64 n7 189889 non-null float64 n8 189889 non-null float64 n9 189889 non-null float64 n10 191606 non-null float64 n11 182425 non-null float64 n12 189889 non-null float64 n13 189889 non-null float64 n14 189889 non-null float64 dtypes: float64(35), int64(8), object(5) memory usage: 73.2+ MB
data_train.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 800000.0 | 399999.500000 | 230940.252013 | 0.00 | 199999.75 | 399999.500 | 599999.25 | 799999.00 |
loanAmnt | 800000.0 | 14416.818875 | 8716.086178 | 500.00 | 8000.00 | 12000.000 | 20000.00 | 40000.00 |
term | 800000.0 | 3.482745 | 0.855832 | 3.00 | 3.00 | 3.000 | 3.00 | 5.00 |
interestRate | 800000.0 | 13.238391 | 4.765757 | 5.31 | 9.75 | 12.740 | 15.99 | 30.99 |
installment | 800000.0 | 437.947723 | 261.460393 | 15.69 | 248.45 | 375.135 | 580.71 | 1715.42 |
employmentTitle | 799999.0 | 72005.351714 | 106585.640204 | 0.00 | 427.00 | 7755.000 | 117663.50 | 378351.00 |
homeOwnership | 800000.0 | 0.614213 | 0.675749 | 0.00 | 0.00 | 1.000 | 1.00 | 5.00 |
annualIncome | 800000.0 | 76133.910493 | 68947.513672 | 0.00 | 45600.00 | 65000.000 | 90000.00 | 10999200.00 |
verificationStatus | 800000.0 | 1.009683 | 0.782716 | 0.00 | 0.00 | 1.000 | 2.00 | 2.00 |
isDefault | 800000.0 | 0.199513 | 0.399634 | 0.00 | 0.00 | 0.000 | 0.00 | 1.00 |
purpose | 800000.0 | 1.745982 | 2.367453 | 0.00 | 0.00 | 0.000 | 4.00 | 13.00 |
postCode | 799999.0 | 258.535648 | 200.037446 | 0.00 | 103.00 | 203.000 | 395.00 | 940.00 |
regionCode | 800000.0 | 16.385758 | 11.036679 | 0.00 | 8.00 | 14.000 | 22.00 | 50.00 |
dti | 799761.0 | 18.284557 | 11.150155 | -1.00 | 11.79 | 17.610 | 24.06 | 999.00 |
delinquency_2years | 800000.0 | 0.318239 | 0.880325 | 0.00 | 0.00 | 0.000 | 0.00 | 39.00 |
ficoRangeLow | 800000.0 | 696.204081 | 31.865995 | 630.00 | 670.00 | 690.000 | 710.00 | 845.00 |
ficoRangeHigh | 800000.0 | 700.204226 | 31.866674 | 634.00 | 674.00 | 694.000 | 714.00 | 850.00 |
openAcc | 800000.0 | 11.598020 | 5.475286 | 0.00 | 8.00 | 11.000 | 14.00 | 86.00 |
pubRec | 800000.0 | 0.214915 | 0.606467 | 0.00 | 0.00 | 0.000 | 0.00 | 86.00 |
pubRecBankruptcies | 799595.0 | 0.134163 | 0.377471 | 0.00 | 0.00 | 0.000 | 0.00 | 12.00 |
revolBal | 800000.0 | 16228.706505 | 22458.020544 | 0.00 | 5944.00 | 11132.000 | 19734.00 | 2904836.00 |
revolUtil | 799469.0 | 51.790734 | 24.516126 | 0.00 | 33.40 | 52.100 | 70.70 | 892.30 |
totalAcc | 800000.0 | 24.998861 | 11.999201 | 2.00 | 16.00 | 23.000 | 32.00 | 162.00 |
initialListStatus | 800000.0 | 0.416953 | 0.493055 | 0.00 | 0.00 | 0.000 | 1.00 | 1.00 |
applicationType | 800000.0 | 0.019267 | 0.137464 | 0.00 | 0.00 | 0.000 | 0.00 | 1.00 |
title | 799999.0 | 1754.113589 | 7941.474040 | 0.00 | 0.00 | 1.000 | 5.00 | 61680.00 |
policyCode | 800000.0 | 1.000000 | 0.000000 | 1.00 | 1.00 | 1.000 | 1.00 | 1.00 |
n0 | 759730.0 | 0.511932 | 1.333266 | 0.00 | 0.00 | 0.000 | 0.00 | 51.00 |
n1 | 759730.0 | 3.642330 | 2.246825 | 0.00 | 2.00 | 3.000 | 5.00 | 33.00 |
n2 | 759730.0 | 5.642648 | 3.302810 | 0.00 | 3.00 | 5.000 | 7.00 | 63.00 |
n2.1 | 759730.0 | 5.642648 | 3.302810 | 0.00 | 3.00 | 5.000 | 7.00 | 63.00 |
n4 | 766761.0 | 4.735641 | 2.949969 | 0.00 | 3.00 | 4.000 | 6.00 | 49.00 |
n5 | 759730.0 | 8.107937 | 4.799210 | 0.00 | 5.00 | 7.000 | 11.00 | 70.00 |
n6 | 759730.0 | 8.575994 | 7.400536 | 0.00 | 4.00 | 7.000 | 11.00 | 132.00 |
n7 | 759730.0 | 8.282953 | 4.561689 | 0.00 | 5.00 | 7.000 | 10.00 | 79.00 |
n8 | 759729.0 | 14.622488 | 8.124610 | 1.00 | 9.00 | 13.000 | 19.00 | 128.00 |
n9 | 759730.0 | 5.592345 | 3.216184 | 0.00 | 3.00 | 5.000 | 7.00 | 45.00 |
n10 | 766761.0 | 11.643896 | 5.484104 | 0.00 | 8.00 | 11.000 | 14.00 | 82.00 |
n11 | 730248.0 | 0.000815 | 0.030075 | 0.00 | 0.00 | 0.000 | 0.00 | 4.00 |
n12 | 759730.0 | 0.003384 | 0.062041 | 0.00 | 0.00 | 0.000 | 0.00 | 4.00 |
n13 | 759730.0 | 0.089366 | 0.509069 | 0.00 | 0.00 | 0.000 | 0.00 | 39.00 |
n14 | 759730.0 | 2.178606 | 1.844377 | 0.00 | 1.00 | 2.000 | 3.00 | 30.00 |
data_train.head().append(data_train.tail()).T
0 | 1 | 2 | 3 | 4 | 799995 | 799996 | 799997 | 799998 | 799999 | |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 2 | 3 | 4 | 799995 | 799996 | 799997 | 799998 | 799999 |
loanAmnt | 35000 | 18000 | 12000 | 11000 | 3000 | 25000 | 17000 | 6000 | 19200 | 9000 |
term | 5 | 5 | 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
interestRate | 19.52 | 18.49 | 16.99 | 7.26 | 12.99 | 14.49 | 7.9 | 13.33 | 6.92 | 11.06 |
installment | 917.97 | 461.9 | 298.17 | 340.96 | 101.07 | 860.41 | 531.94 | 203.12 | 592.14 | 294.91 |
grade | E | D | D | A | C | C | A | C | A | B |
subGrade | E2 | D2 | D3 | A4 | C2 | C4 | A4 | C3 | A4 | B3 |
employmentTitle | 320 | 219843 | 31698 | 46854 | 54 | 2659 | 29205 | 2582 | 151 | 13 |
employmentLength | 2 years | 5 years | 8 years | 10+ years | NaN | 7 years | 10+ years | 10+ years | 10+ years | 5 years |
homeOwnership | 2 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 |
annualIncome | 110000 | 46000 | 74000 | 118000 | 29000 | 72000 | 99000 | 65000 | 96000 | 120000 |
verificationStatus | 2 | 2 | 2 | 1 | 2 | 0 | 2 | 2 | 2 | 0 |
issueDate | 2014-07-01 | 2012-08-01 | 2015-10-01 | 2015-08-01 | 2016-03-01 | 2016-07-01 | 2013-04-01 | 2015-10-01 | 2015-02-01 | 2018-08-01 |
isDefault | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
purpose | 1 | 0 | 0 | 4 | 10 | 0 | 4 | 0 | 4 | 4 |
postCode | 137 | 156 | 337 | 148 | 301 | 242 | 563 | 47 | 34 | 62 |
regionCode | 32 | 18 | 14 | 11 | 21 | 8 | 10 | 17 | 18 | 13 |
dti | 17.05 | 27.83 | 22.77 | 17.21 | 32.16 | 19.03 | 15.72 | 12.11 | 29.25 | 8.99 |
delinquency_2years | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
ficoRangeLow | 730 | 700 | 675 | 685 | 690 | 710 | 720 | 670 | 675 | 695 |
ficoRangeHigh | 734 | 704 | 679 | 689 | 694 | 714 | 724 | 674 | 679 | 699 |
openAcc | 7 | 13 | 11 | 9 | 12 | 14 | 7 | 5 | 16 | 7 |
pubRec | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
pubRecBankruptcies | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
revolBal | 24178 | 15096 | 4606 | 9948 | 2942 | 9933 | 20472 | 6381 | 69702 | 8420 |
revolUtil | 48.9 | 38.9 | 51.8 | 52.6 | 32 | 46.4 | 98.4 | 51.9 | 61.3 | 72.6 |
totalAcc | 27 | 18 | 27 | 28 | 27 | 15 | 42 | 36 | 37 | 13 |
initialListStatus | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
applicationType | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
earliesCreditLine | Aug-2001 | May-2002 | May-2006 | May-1999 | Aug-1977 | Aug-2011 | May-1989 | Jul-2002 | Jan-1994 | Feb-2002 |
title | 1 | 1723 | 0 | 4 | 11 | 0 | 33369 | 0 | 4 | 4 |
policyCode | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
n0 | 0 | NaN | 0 | 6 | 1 | 0 | 0 | 2 | 0 | 2 |
n1 | 2 | NaN | 0 | 4 | 2 | 5 | 2 | 1 | 5 | 2 |
n2 | 2 | NaN | 3 | 6 | 7 | 10 | 2 | 4 | 8 | 3 |
n2.1 | 2 | NaN | 3 | 6 | 7 | 10 | 2 | 4 | 8 | 3 |
n4 | 4 | 10 | 0 | 4 | 2 | 6 | 2 | 1 | 7 | 2 |
n5 | 9 | NaN | 0 | 16 | 4 | 6 | 15 | 4 | 10 | 3 |
n6 | 8 | NaN | 21 | 4 | 9 | 2 | 16 | 26 | 6 | 4 |
n7 | 4 | NaN | 4 | 7 | 10 | 12 | 2 | 4 | 12 | 4 |
n8 | 12 | NaN | 5 | 21 | 15 | 13 | 19 | 10 | 22 | 8 |
n9 | 2 | NaN | 3 | 6 | 7 | 10 | 2 | 4 | 8 | 3 |
n10 | 7 | 13 | 11 | 9 | 12 | 14 | 7 | 5 | 16 | 7 |
n11 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
n12 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
n13 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
n14 | 2 | NaN | 4 | 1 | 4 | 3 | 0 | 4 | 5 | 2 |
# 判断有多少个缺失特征
n_missing = data_train.isna().any().sum()
# 类似:data_train.isnull().any().sum()
n_missing
22
def feature_null_more_than_ratio(data: pd.DataFrame, ratio: float) -> dict:
have_null_fea_dict = (data.isnull().sum()/len(data)).to_dict()
res = {}
for key,val in have_null_fea_dict.items():
if val > ratio:
res[key] = val
return res
# 缺失值超过一定比例的特征
for ratio in np.linspace(0.1, 1, 10):
dct = feature_null_more_than_ratio(data_train, ratio)
print("features null ratio bigger than {}: {}".format(ratio, dct))
features null ratio bigger than 0.1: {} features null ratio bigger than 0.2: {} features null ratio bigger than 0.30000000000000004: {} features null ratio bigger than 0.4: {} features null ratio bigger than 0.5: {} features null ratio bigger than 0.6: {} features null ratio bigger than 0.7000000000000001: {} features null ratio bigger than 0.8: {} features null ratio bigger than 0.9: {} features null ratio bigger than 1.0: {}
# 不同特征缺失值比例
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 4))
missing = data_train.isnull().sum()/len(data_train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar(ax = ax);
# 有缺失值的特征数据分布
fig, axes = plt.subplots(nrows=n_missing, ncols=1, figsize=(12, 3*n_missing))
for i, (key, val) in enumerate(missing.to_dict().items()):
val = int(val * len(data_train))
axes[i].hist(data_train[key].dropna());
axes[i].set_title(key +"/" + str(val), loc='right', pad=1.0);
对于缺失值的处理:
fea = "employmentLength"
data_train[fea].dropna().value_counts()
10+ years 262753 2 years 72358 < 1 year 64237 3 years 64152 1 year 52489 5 years 50102 4 years 47985 6 years 37254 8 years 36192 7 years 35407 9 years 30272 Name: employmentLength, dtype: int64
[col for col in data_train.columns if data_train[col].nunique() <= 1]
['policyCode']
[col for col in data_test.columns if data_test[col].nunique() <= 1]
['policyCode']
# unique
for fea in data_train.columns:
print("Feature:", fea, "\t",
"Number Unique:", len(data_train[fea].unique()), "\t",
"Sample:", data_train[fea].iloc[0])
Feature: id Number Unique: 800000 Sample: 0 Feature: loanAmnt Number Unique: 1540 Sample: 35000.0 Feature: term Number Unique: 2 Sample: 5 Feature: interestRate Number Unique: 641 Sample: 19.52 Feature: installment Number Unique: 72360 Sample: 917.97 Feature: grade Number Unique: 7 Sample: E Feature: subGrade Number Unique: 35 Sample: E2 Feature: employmentTitle Number Unique: 248684 Sample: 320.0 Feature: employmentLength Number Unique: 12 Sample: 2 years Feature: homeOwnership Number Unique: 6 Sample: 2 Feature: annualIncome Number Unique: 44926 Sample: 110000.0 Feature: verificationStatus Number Unique: 3 Sample: 2 Feature: issueDate Number Unique: 139 Sample: 2014-07-01 Feature: isDefault Number Unique: 2 Sample: 1 Feature: purpose Number Unique: 14 Sample: 1 Feature: postCode Number Unique: 933 Sample: 137.0 Feature: regionCode Number Unique: 51 Sample: 32 Feature: dti Number Unique: 6322 Sample: 17.05 Feature: delinquency_2years Number Unique: 30 Sample: 0.0 Feature: ficoRangeLow Number Unique: 39 Sample: 730.0 Feature: ficoRangeHigh Number Unique: 39 Sample: 734.0 Feature: openAcc Number Unique: 75 Sample: 7.0 Feature: pubRec Number Unique: 32 Sample: 0.0 Feature: pubRecBankruptcies Number Unique: 12 Sample: 0.0 Feature: revolBal Number Unique: 71116 Sample: 24178.0 Feature: revolUtil Number Unique: 1287 Sample: 48.9 Feature: totalAcc Number Unique: 134 Sample: 27.0 Feature: initialListStatus Number Unique: 2 Sample: 0 Feature: applicationType Number Unique: 2 Sample: 0 Feature: earliesCreditLine Number Unique: 720 Sample: Aug-2001 Feature: title Number Unique: 39645 Sample: 1.0 Feature: policyCode Number Unique: 1 Sample: 1.0 Feature: n0 Number Unique: 40 Sample: 0.0 Feature: n1 Number Unique: 34 Sample: 2.0 Feature: n2 Number Unique: 51 Sample: 2.0 Feature: n2.1 Number Unique: 51 Sample: 2.0 Feature: n4 Number Unique: 47 Sample: 4.0 Feature: n5 Number Unique: 66 Sample: 9.0 Feature: n6 Number Unique: 108 Sample: 8.0 Feature: n7 Number Unique: 71 Sample: 4.0 Feature: n8 Number Unique: 103 Sample: 12.0 Feature: n9 Number Unique: 45 Sample: 2.0 Feature: n10 Number Unique: 77 Sample: 7.0 Feature: n11 Number Unique: 6 Sample: 0.0 Feature: n12 Number Unique: 6 Sample: 0.0 Feature: n13 Number Unique: 29 Sample: 0.0 Feature: n14 Number Unique: 32 Sample: 2.0
# 不同数据类型
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
print("Numerical Features: ", len(numerical_fea))
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
print("Category Features: ", len(category_fea))
for fea in category_fea:
print("Feature: ", fea, "\t",
"Number Unique:", len(data_train[fea].unique()), "\t",
"Sample: ", data_train[fea].iloc[0])
Numerical Features: 42 Category Features: 5 Feature: grade Number Unique: 7 Sample: E Feature: subGrade Number Unique: 35 Sample: E2 Feature: employmentLength Number Unique: 12 Sample: 2 years Feature: issueDate Number Unique: 139 Sample: 2014-07-01 Feature: earliesCreditLine Number Unique: 720 Sample: Aug-2001
def is_float(x):
return bool(int(x) - x)
def all_unique_is_float(arr):
for i in arr:
if is_float(i):
return True
return False
# 离散/连续数据类型
# numerical_serial_fea = [fea for fea in numerical_fea if data_train[fea].nunique() > 10]
numerical_serial_fea = [fea for fea in numerical_fea if
all_unique_is_float(data_train[fea].dropna().unique())
or data_train[fea].nunique() > 50]
numerical_discrate_fea = [fea for fea in numerical_fea if fea not in numerical_serial_fea]
numerical_discrate_fea
['term', 'homeOwnership', 'verificationStatus', 'isDefault', 'purpose', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'pubRec', 'pubRecBankruptcies', 'initialListStatus', 'applicationType', 'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n9', 'n11', 'n12', 'n13', 'n14']
numerical_serial_fea
['id', 'loanAmnt', 'interestRate', 'installment', 'employmentTitle', 'annualIncome', 'postCode', 'regionCode', 'dti', 'openAcc', 'revolBal', 'revolUtil', 'totalAcc', 'title', 'n5', 'n6', 'n7', 'n8', 'n10']
# value counts
vc_features = features #category_fea + numerical_discrate_fea
rows = len(vc_features)
fig, axes = plt.subplots(nrows=rows, ncols=1, figsize=(12, 4*rows))
for i, fea in enumerate(vc_features):
# sns.barplot(
# x=list(data_train[fea].value_counts())[:10],
# y=data_train[fea].value_counts().keys()[:10],
# ax = axes[i]
# )
data_train[fea].value_counts().nlargest(10).plot(
kind='barh', ax=axes[i])
axes[i].set_title(fea)
# 看分布(离散/连续数据类型)进一步验证
features = category_fea + numerical_discrate_fea + numerical_serial_fea
df = data_train[features]
figsize = (18, 40)
cols = 4
rows = len(features) // cols + 1
def trim_axs(axs, N):
axs = axs.flat
for ax in axs[N:]:
ax.remove()
return axs[:N]
def plot_hist(fea_data, ax):
if fea_data.dtype == "object":
ax.hist(fea_data, bins=15)
else:
try:
sns.distplot(fea_data, ax=ax, axlabel=False)
except Exception as e:
ax.hist(fea_data, bins=15)
axs = plt.figure(figsize=figsize, constrained_layout=False).subplots(rows, cols)
axs = trim_axs(axs, len(features))
for ax, fea in zip(axs, features):
fea_data = df[fea].dropna()
ax.set_title(fea)
plot_hist(fea_data, ax)
类别数据:
数值数据:
id
loanAmnt 贷款金额
interestRate 贷款利率
installment 分期付款金额
employmentTitle 就业职称
annualIncome 年收入
postCode 借款人在贷款申请时的贷款用途类别
regionCode 地区编码
purpose 借款人在贷款申请时的贷款用途类别
dti 债务收入比
delinquency_2years 借款人过去2年信用档案中逾期30天以上的违约事件数
ficoRangeLow 借款人在贷款发放时的fico所属的下限范围
ficoRangeHigh 借款人在贷款发放时的fico所属的上限范围
openAcc 借款人信用档案中未结信用额度的数量
pubRec 贬损公共记录的数量
pubRecBankruptcies 公开记录清除的数量
revolBal 信贷周转余额合计
revolUtil 循环额度利用率,或借款人使用的相对于所有可用循环信贷的信贷金额
totalAcc 借款人信用档案中当前的信用额度总数
title 借款人提供的贷款名称
n 系列:n0 n1 n2 n2.1 n4 n5 n6 n7 n8 n9 n10 n13 n14
term 贷款期限
homeOwnership 借款人在登记时提供的房屋所有权状况
verificationStatus 验证状态
initialListStatus 贷款的初始列表状态
applicationType 表明贷款是个人申请还是与两个共同借款人的联合申请
n 系列:n11 n12
set(data_test["purpose"].dropna().to_list())
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
# 偏态
skews = data_train[numerical_serial_fea].skew()
skews
id 1.586291e-17 loanAmnt 7.826221e-01 interestRate 7.123646e-01 installment 1.006328e+00 employmentTitle 1.379124e+00 annualIncome 4.618773e+01 purpose 1.251126e+00 postCode 8.321358e-01 regionCode 7.824390e-01 dti 2.730132e+01 delinquency_2years 5.685041e+00 ficoRangeLow 1.282404e+00 ficoRangeHigh 1.282608e+00 openAcc 1.298413e+00 pubRec 1.352173e+01 pubRecBankruptcies 3.440526e+00 revolBal 1.592468e+01 revolUtil -1.805942e-02 totalAcc 9.560535e-01 title 5.225696e+00 n0 5.106909e+00 n1 1.400085e+00 n2 1.542892e+00 n2.1 1.542892e+00 n4 1.701304e+00 n5 1.339371e+00 n6 2.063377e+00 n7 1.471781e+00 n8 1.303497e+00 n9 1.432905e+00 n10 1.296988e+00 n13 1.363116e+01 n14 1.486073e+00 dtype: float64
def addone(x):
return x + 1e-5
new_data
0 2.836151 1 3.326115 2 3.125444 3 2.845491 4 3.470724 ... 799995 2.946017 799996 2.754934 799997 2.494032 799998 3.375880 799999 2.196114 Name: dti, Length: 799759, dtype: float64
# 非正态的尝试 log 化
# sns.FacetGrid(df, size=2).map(sns.distplot, fea, ax=ax).add_legend()
skew_dict = skews.to_dict()
fig, axes = plt.subplots(nrows=len(skew_dict), ncols=2, figsize=(12, 3*len(skew_dict)))
for i, (key, val) in enumerate(skew_dict.items()):
fea_data = df[key].dropna()
plot_hist(fea_data, ax=axes[i, 0])
axes[i, 0].set_title(key +"/" + str(val), loc='right', pad=1.0)
new_data = fea_data.dropna().apply(addone).apply(np.log).dropna()
plot_hist(new_data, ax=axes[i, 1])
new_val = new_data.skew()
axes[i, 1].set_title(key +"/" + str(new_val), loc='right', pad=1.0)
# 不同特征值在不同 label 上的分布
fig, axes = plt.subplots(nrows=len(features), ncols=2, figsize=(12, 3*len(features)))
for i, fea in enumerate(features):
fea_data = data_train[data_train.isDefault == 1][fea].dropna()
plot_hist(fea_data, ax=axes[i, 0])
axes[i, 0].set_title(fea + " Label=1", loc='right', pad=1.0)
fea_data = data_train[data_train.isDefault == 0][fea].dropna()
plot_hist(fea_data, ax=axes[i, 1])
axes[i, 1].set_title(fea + " Label=0", loc='right', pad=1.0)
# category 编码
import category_encoders as ce
fea = "grade"
encoder1 = ce.OneHotEncoder(cols=[fea])
encoder2 = ce.OrdinalEncoder(cols=[fea])
encoder3 = ce.TargetEncoder(cols=[fea])
encoder4 = ce.CatBoostEncoder(cols=[fea])
tmp = data_train
X = tmp[fea]
y = tmp.isDefault
encoder3.fit(X, y)
encoder3.transform(X)
rows = len(features)
need_drop_feas = ["policyCode", "isDefault"]
fig, axes = plt.subplots(nrows=rows, ncols=1, figsize=(12, 4*rows))
g = sns.FacetGrid(data_train, hue="isDefault")
for i, fea in enumerate(features):
ax.set_title(fea)
if fea in category_fea + numerical_discrate_fea:
sns.countplot(fea, hue="isDefault", data=data_train, ax=axes[i])
else:
g = g.map(sns.distplot, fea, hist=False, rug=False, ax=axes[i])
df = data_train.drop(columns=['policyCode'])
df = df.dropna()
grade_dct = dict(zip(['A', 'B', 'C', 'D', 'E', 'F', 'G'], range(10, 80, 10)))
def deal_grade(x):
return grade_dct.get(x)
df["grade"] = df["grade"].apply(deal_grade)
def deal_sub_grade(x):
return grade_dct.get(x[0]) + int(x[1])
df["subGrade"] = df["subGrade"].apply(deal_sub_grade)
def deal_employmentLength(x):
if x == "10+ years":
return 10.0
elif x == "< 1 year":
return 0.5
else:
return float(x[0])
df["employmentLength"] = df["employmentLength"].apply(deal_employmentLength)
def deal_date(x):
return int(x.split("-")[1])
df["issueDate"] = df["issueDate"].apply(deal_date)
df["earliesCreditLine"] = df["earliesCreditLine"].apply(deal_date)
df.shape
(686195, 46)
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x3032761d0>