In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from optbinning import OptimalBinning
from typing import List, Optional, Union
import pnlp

import category_encoders as ce
warnings.filterwarnings('ignore')

在上一版的基础上,先固定模型不变,然后对特征工程做以下调整处理:

  • 删除缺失值:由于测试集也有缺失值,因此这个调整不做。
  • 类别(地区编码等)人工分桶:先不处理。
  • 删除异常值:先不处理。

  • 数值不分桶:前后对比 0.7191 VS 0.7208(v2.0)
  • n 系列的整数都按类别分桶:前后对比 0.7208 VS 0.7203(v3.0)
  • 地区数据不分桶:前后对比 0.7203 VS 0.7232(v4.0)
  • 数值特征按 unique 值是否大于 50 来区分:前后对比 0.7232 VS 0.xxxx(v5.0)

  • 数值特征不分桶,n系列按类别分桶,地区不分桶:v6.0
  • 数据特征不分桶,n系列作为数据特征,地区不分桶:v7.0

  • 年龄 < 1 从 0.5 调整为 0,对象 NA 从 "" 替换为 NA:v8.0
  • 年龄 < 1 保持 0.5,NA 从 ”“ 替换为 NA:v9.0

回到 7.0

  • 增加 title:v10.0
  • 需要分桶的类别数据转换:v11.0
  • grade 和 subGrade 增加到类别里:v12.0
  • 去掉 grade 和 subGrade 的数值类别:v13.0
  • 增加 obj_need_bucket_feas 分桶,即 分桶+转换:v14.0

回到 7.0

  • v15.0

特征交叉和特征选择也先不做。

In [4]:
# v5.0 lgb_score_mean: 0.7279104550696077
# v6.0 lgb_score_mean: 0.7292114750286548
# v7.0 lgb_score_mean: 0.7302781867038274
# v8.0 lgb_score_mean: 0.7298601320998748
# v9.0 lgb_score_mean: 0.729909166985008
# v10.0 lgb_score_mean: 0.7304943179820246
# v11.0 lgb_score_mean: 0.7311966915796223
# v12.0 lgb_score_mean: 0.7312080192382515
# v13.0 lgb_score_mean: 0.7309075610525948
# v14.0 lgb_score_mean: 0.7312571984016215
# v15.0 lgb_score_mean: 0.7302878558046096

# baseline: lgb_score_mean: 0.7312705586323757
In [5]:
train_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/train.csv"
test_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/testA.csv"

data_train = pd.read_csv(train_file)
data_test = pd.read_csv(test_file)
In [7]:
zero_one_feas = [
    "initialListStatus", "applicationType", 
]

num_not_bucket_feas = [
    "annualIncome",
    "term", "employmentLength", 
    "loanAmnt", "interestRate", "installment", "dti", 
    "delinquency_2years",
    "ficoRangeLow", "ficoRangeHigh", 
    "openAcc", 
    "pubRec", "pubRecBankruptcies",
    "revolBal", "revolUtil", "totalAcc",
    "earliesCreditLine", 
    "grade", "subGrade"
]

num_need_bucket_feas = [
]

obj_not_bucket_feas = [
    "homeOwnership", "verificationStatus", "n11", "n12",
]

obj_need_bucket_feas = [
    "employmentTitle", "postCode", "title",
    "n0", "n1", "n2", 
    'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n13', 'n14',
    "regionCode", "purpose"
]

na_ave_feas = ["dti", "revolUtil"]
na_ave_int_feas = ["employmentLength", "pubRecBankruptcies"]
In [9]:
def is_float(x):
    return bool(int(x) - x)

def all_data_is_float(df: pd.DataFrame, feature: str):
    uniq = df[feature].unique()
    for i in uniq:
        if pd.isna(i):
            continue
        if is_float(i):
            return True
    return False

def drop_given_features(df: pd.DataFrame, feature_list: List[str]) -> pd.DataFrame:
    return df.drop(columns=feature_list)

def drop_uniquevalue_features(df: pd.DataFrame) -> pd.DataFrame:
    need_drop_feas = [col for col in df.columns if df[col].nunique() <= 1]
    return df.drop(columns=need_drop_feas)

def convert_num_to_obj(x: Union[int, float]):
    if pd.isna(x):
        return x
    else:
        return str(int(x))

def convert_float_to_int(x: float):
    if pd.isna(x):
        return np.nan
    else:
        return int(x)

def deal_employmentLength(x):
    if pd.notna(x):
        if x == "10+ years":
            return 10
        elif x == "< 1 year":
            return 0
        else:
            return int(x[0])
    else:
        return np.nan
    
insignificant_feas = ["issueDate"]
grade_dct = dict(zip(['A', 'B', 'C', 'D', 'E', 'F', 'G'], range(10, 80, 10)))
In [10]:
def process_data(
    data: pd.DataFrame, 
    num_optbs: list = [], 
    obj_optbs: list = [], 
    training_data: bool = True):
    
    if training_data:
        num_optbs = []
        obj_optbs = []
    
    data = drop_uniquevalue_features(data)
    data = drop_given_features(data, insignificant_feas)

    data['grade'] = data['grade'].map(grade_dct)
    data["subGrade"] = data["subGrade"].apply(lambda x: grade_dct.get(x[0]) + int(x[1]))
    data["employmentLength"] = data["employmentLength"].apply(deal_employmentLength)
    data["earliesCreditLine"] = data["earliesCreditLine"].apply(lambda x: int(x[-4:]))
    
    # 转换
    for i, fea in enumerate(obj_need_bucket_feas):
        data[fea+'_cnts'] = data.groupby([fea])['id'].transform('count')
        data[fea+'_rank'] = data.groupby([fea])['id'].rank(ascending=False).astype(int)
        data = data.drop(columns=[fea])
    
    # 分桶:数值特征
#     for i, fea in enumerate(num_need_bucket_feas):
#         if training_data:
#             optb = OptimalBinning(name=fea, dtype="numerical", solver="cp")
#             optb.fit(data[fea], data["isDefault"])
#             num_optbs.append(optb)
#         else:
#             optb = num_optbs[i]
#         data[fea] = optb.transform(data[fea])

    # 分桶:类别特征
#     for i, fea in enumerate(obj_need_bucket_feas):
#         if fea.endswith("cnts"):
#             fea = fea + "_cnts"
#         elif fea.endswith("rank"):
#             fea = fea + "_rank"
            
#         if training_data:
#             try:
#                 optb = OptimalBinning(name=fea, dtype="categorical", solver="mip", cat_cutoff=0.1)
#                 optb.fit(data[fea], data["isDefault"])
#                 obj_optbs.append(optb)
#             except Exception as err:
#                 print(fea, data[fea].nunique())
#                 continue
#         else:
#             optb = obj_optbs[i]
#         data[fea] = optb.transform(data[fea])

    data[na_ave_feas] = data[na_ave_feas].fillna(data[na_ave_feas].mean())
    data[na_ave_int_feas] = data[na_ave_int_feas].fillna(data[na_ave_int_feas].mean().apply(int))
    for fea in obj_need_bucket_feas:
        fea1 = fea + "_cnts"
        fea2 = fea + "_rank"
        data[fea1] = data[fea1].fillna(data[fea1].mean())
        data[fea2] = data[fea1].fillna(data[fea2].mean())
    
    data = pd.get_dummies(data, columns=obj_not_bucket_feas, drop_first=True)
    
    if training_data:
        return data, num_optbs, obj_optbs
    else:
        return data
In [11]:
data_train, num_optbs, obj_optbs = process_data(data_train)
In [15]:
data_test = process_data(data_test, num_optbs, obj_optbs, False)
In [16]:
features = list(set(data_train.columns) & set(data_test.columns))
features = [fea for fea in features if fea not in ["isDefault"]]
In [19]:
x_train = data_train[features]
x_test = data_test[features]
y_train = data_train['isDefault']
In [20]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
In [21]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
In [22]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742068	valid_1's auc: 0.72961
[400]	training's auc: 0.75501	valid_1's auc: 0.730788
[600]	training's auc: 0.766032	valid_1's auc: 0.730912
[800]	training's auc: 0.776082	valid_1's auc: 0.730865
Early stopping, best iteration is:
[654]	training's auc: 0.768859	valid_1's auc: 0.731115
[0.7311148422768661]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.743075	valid_1's auc: 0.725659
[400]	training's auc: 0.755906	valid_1's auc: 0.726767
[600]	training's auc: 0.766757	valid_1's auc: 0.727227
[800]	training's auc: 0.77723	valid_1's auc: 0.727332
Early stopping, best iteration is:
[710]	training's auc: 0.772607	valid_1's auc: 0.727422
[0.7311148422768661, 0.727422124643655]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.741881	valid_1's auc: 0.730163
[400]	training's auc: 0.75455	valid_1's auc: 0.731362
[600]	training's auc: 0.765952	valid_1's auc: 0.731753
Early stopping, best iteration is:
[524]	training's auc: 0.761919	valid_1's auc: 0.731801
[0.7311148422768661, 0.727422124643655, 0.7318008859742777]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742291	valid_1's auc: 0.729135
[400]	training's auc: 0.754829	valid_1's auc: 0.730007
[600]	training's auc: 0.766256	valid_1's auc: 0.730673
[800]	training's auc: 0.776744	valid_1's auc: 0.730657
Early stopping, best iteration is:
[623]	training's auc: 0.767645	valid_1's auc: 0.730747
[0.7311148422768661, 0.727422124643655, 0.7318008859742777, 0.7307473514063647]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
[LightGBM] [Warning] Unknown parameter: silent
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742671	valid_1's auc: 0.728153
[400]	training's auc: 0.755277	valid_1's auc: 0.729347
[600]	training's auc: 0.766272	valid_1's auc: 0.729526
Early stopping, best iteration is:
[535]	training's auc: 0.762675	valid_1's auc: 0.729568
[0.7311148422768661, 0.727422124643655, 0.7318008859742777, 0.7307473514063647, 0.7295684174238821]
lgb_scotrainre_list: [0.7311148422768661, 0.727422124643655, 0.7318008859742777, 0.7307473514063647, 0.7295684174238821]
lgb_score_mean: 0.730130724345009
lgb_score_std: 0.001535633961606915
In [ ]:
 
In [265]:
test_out_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/testA_result_V15.0.csv"

rh_test = lgb_test#*0.5 + xgb_test*0.5
data_test['isDefault'] = rh_test
data_test[['id','isDefault']].to_csv(test_out_file, index=False)
In [ ]:
 
In [ ]:
 
In [ ]: