import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from optbinning import OptimalBinning
from typing import List, Optional, Union
import pnlp
import category_encoders as ce
warnings.filterwarnings('ignore')
# On top of the previous version, keep the model fixed and adjust the
# feature engineering as follows:
#   - revert to the v7.0 feature setup
#   - skip feature crossing and feature selection for now
# v5.0 lgb_score_mean: 0.7279104550696077
# v6.0 lgb_score_mean: 0.7292114750286548
# v7.0 lgb_score_mean: 0.7302781867038274
# v8.0 lgb_score_mean: 0.7298601320998748
# v9.0 lgb_score_mean: 0.729909166985008
# v10.0 lgb_score_mean: 0.7304943179820246
# v11.0 lgb_score_mean: 0.7311966915796223
# v12.0 lgb_score_mean: 0.7312080192382515
# v13.0 lgb_score_mean: 0.7309075610525948
# v14.0 lgb_score_mean: 0.7312571984016215
# v15.0 lgb_score_mean: 0.7302878558046096
# baseline: lgb_score_mean: 0.7312705586323757
# Competition data locations.
# NOTE(review): hard-coded absolute paths — adjust for your environment.
train_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/train.csv"
test_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/testA.csv"
# Raw train / test tables, loaded once at module import time.
data_train = pd.read_csv(train_file)
data_test = pd.read_csv(test_file)
# Strictly binary (0/1) indicator features, used as-is.
zero_one_feas = [
    "initialListStatus", "applicationType",
]
# Numeric features kept raw — no bucketing applied.
num_not_bucket_feas = [
    "annualIncome",
    "term", "employmentLength",
    "loanAmnt", "interestRate", "installment", "dti",
    "delinquency_2years",
    "ficoRangeLow", "ficoRangeHigh",
    "openAcc",
    "pubRec", "pubRecBankruptcies",
    "revolBal", "revolUtil", "totalAcc",
    "earliesCreditLine",
    "grade", "subGrade"
]
# Numeric features that would be bucketed — intentionally empty in this version.
num_need_bucket_feas = [
]
# Low-cardinality categoricals: one-hot encoded in process_data.
obj_not_bucket_feas = [
    "homeOwnership", "verificationStatus", "n11", "n12",
]
# High-cardinality categoricals: replaced by count/rank encodings in process_data.
obj_need_bucket_feas = [
    "employmentTitle", "postCode", "title",
    "n0", "n1", "n2",
    'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n13', 'n14',
    "regionCode", "purpose"
]
# NaNs imputed with the column mean (float mean / integer-cast mean).
na_ave_feas = ["dti", "revolUtil"]
na_ave_int_feas = ["employmentLength", "pubRecBankruptcies"]
def is_float(x):
    """Return True when *x* carries a non-zero fractional part."""
    return bool(x - int(x))
def all_data_is_float(df: pd.DataFrame, feature: str):
    """Return True if any non-missing unique value of *feature* has a
    non-zero fractional part (i.e. the column is genuinely float-valued)."""
    return any(
        bool(int(val) - val)
        for val in df[feature].unique()
        if not pd.isna(val)
    )
def drop_given_features(df: pd.DataFrame, feature_list: List[str]) -> pd.DataFrame:
    """Return a new frame without the columns named in *feature_list*."""
    return df.drop(feature_list, axis=1)
def drop_uniquevalue_features(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every column that holds at most one distinct non-null value
    (constant or all-NaN columns carry no signal)."""
    constant_cols = []
    for col in df.columns:
        if df[col].nunique() <= 1:
            constant_cols.append(col)
    return df.drop(columns=constant_cols)
def convert_num_to_obj(x: Union[int, float]):
    """Render a numeric code as its integer string; missing values pass through."""
    return x if pd.isna(x) else str(int(x))
def convert_float_to_int(x: float):
    """Truncate *x* to an int, mapping missing values to np.nan."""
    return np.nan if pd.isna(x) else int(x)
def deal_employmentLength(x):
    """Map an employmentLength string to integer years.

    "10+ years" -> 10, "< 1 year" -> 0, "N years" -> N (first digit);
    missing values stay NaN.
    """
    if pd.isna(x):
        return np.nan
    if x == "10+ years":
        return 10
    if x == "< 1 year":
        return 0
    return int(x[0])
# Features judged uninformative and dropped outright in process_data.
insignificant_feas = ["issueDate"]
# Ordinal encoding for loan grade: A->10, B->20, ..., G->70; subGrade adds its digit.
grade_dct = dict(zip(['A', 'B', 'C', 'D', 'E', 'F', 'G'], range(10, 80, 10)))
def process_data(
        data: pd.DataFrame,
        num_optbs: Optional[list] = None,
        obj_optbs: Optional[list] = None,
        training_data: bool = True):
    """Apply the full feature-engineering pipeline to one data split.

    Parameters
    ----------
    data : raw train or test frame (test lacks the "isDefault" column).
    num_optbs, obj_optbs : binning objects fitted on the training split;
        ignored (reset to fresh lists) when training_data is True.
    training_data : when True, return (data, num_optbs, obj_optbs);
        otherwise return only the transformed frame.
    """
    # BUGFIX: the original used mutable default arguments (`= []`), which
    # are shared across calls; use None sentinels instead.
    if num_optbs is None:
        num_optbs = []
    if obj_optbs is None:
        obj_optbs = []
    if training_data:
        num_optbs = []
        obj_optbs = []
    data = drop_uniquevalue_features(data)
    data = drop_given_features(data, insignificant_feas)
    # Ordinal-encode grade and subGrade (e.g. "A1" -> 11, "G5" -> 75).
    data['grade'] = data['grade'].map(grade_dct)
    data["subGrade"] = data["subGrade"].apply(lambda x: grade_dct.get(x[0]) + int(x[1]))
    data["employmentLength"] = data["employmentLength"].apply(deal_employmentLength)
    # Keep only the 4-digit year of the earliest credit line ("Aug-2001" -> 2001).
    data["earliesCreditLine"] = data["earliesCreditLine"].apply(lambda x: int(x[-4:]))
    # Replace each high-cardinality categorical with a frequency count and a
    # within-group rank over 'id', then drop the original column.
    for fea in obj_need_bucket_feas:
        data[fea + '_cnts'] = data.groupby([fea])['id'].transform('count')
        data[fea + '_rank'] = data.groupby([fea])['id'].rank(ascending=False).astype(int)
        data = data.drop(columns=[fea])
    # (OptimalBinning experiments for numeric/categorical features are
    # disabled in this version.)
    # Mean-impute selected numeric features; integer-cast the mean for
    # count-like columns.
    data[na_ave_feas] = data[na_ave_feas].fillna(data[na_ave_feas].mean())
    data[na_ave_int_feas] = data[na_ave_int_feas].fillna(data[na_ave_int_feas].mean().apply(int))
    for fea in obj_need_bucket_feas:
        fea1 = fea + "_cnts"
        fea2 = fea + "_rank"
        data[fea1] = data[fea1].fillna(data[fea1].mean())
        # BUGFIX: the original filled the rank column from the COUNTS column
        # (`data[fea1].fillna(...)`), silently overwriting the rank feature.
        data[fea2] = data[fea2].fillna(data[fea2].mean())
    # One-hot encode the low-cardinality categoricals.
    data = pd.get_dummies(data, columns=obj_not_bucket_feas, drop_first=True)
    if training_data:
        return data, num_optbs, obj_optbs
    return data
# Fit the feature pipeline on train, then re-apply it to test.
data_train, num_optbs, obj_optbs = process_data(data_train)
data_test = process_data(data_test, num_optbs, obj_optbs, False)
# Use only columns present in BOTH splits (dummy columns may differ),
# excluding the target itself.
features = list(set(data_train.columns) & set(data_test.columns))
features = [fea for fea in features if fea not in ["isDefault"]]
x_train = data_train[features]
x_test = data_test[features]
y_train = data_train['isDefault']
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train with 5-fold CV and return out-of-fold / averaged test predictions.

    Parameters
    ----------
    clf : the model module (lgb, xgb) or estimator class (CatBoostRegressor).
    train_x, train_y : training features and binary target.
    test_x : test features; predicted by every fold's model.
    clf_name : one of "lgb", "xgb", "cat" — selects the training branch.

    Returns
    -------
    (train, test) : out-of-fold predictions for the training rows, and the
    mean of the five fold models' predictions for the test rows.
    """
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])  # out-of-fold predictions
    test = np.zeros(test_x.shape[0])    # accumulated fold-averaged test predictions
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            # BUGFIX: Booster.predict requires a DMatrix; the original passed
            # the raw DataFrame, which raises a TypeError at prediction time.
            test_pred = model.predict(clf.DMatrix(test_x), ntree_limit=model.best_ntree_limit)
        if clf_name == "cat":
            # NOTE(review): CatBoostRegressor outputs raw scores, not
            # probabilities; roc_auc_score still works on continuous scores.
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        train[valid_index] = val_pred
        # BUGFIX: accumulate each fold's contribution. The original used
        # `test = test_pred / kf.n_splits`, which discarded the first four
        # folds and submitted only the last fold's predictions at 1/5 scale.
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
def lgb_model(x_train, y_train, x_test):
    """Convenience wrapper: run the CV pipeline with LightGBM."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
def xgb_model(x_train, y_train, x_test):
    """Convenience wrapper: run the CV pipeline with XGBoost."""
    return cv_model(xgb, x_train, y_train, x_test, "xgb")
def cat_model(x_train, y_train, x_test):
    """Convenience wrapper: run the CV pipeline with CatBoost.

    BUGFIX: the original had no return statement, so callers always got None
    and silently discarded the trained predictions.
    """
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test
# Train the 5-fold LightGBM ensemble; lgb_test holds the averaged test predictions.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
# Training log (5-fold LightGBM), pasted from the notebook run:
#   fold best iterations: 654, 710, 524, 623, 535
#   fold valid AUCs: [0.7311148422768661, 0.727422124643655,
#                     0.7318008859742777, 0.7307473514063647,
#                     0.7295684174238821]
#   lgb_score_mean: 0.730130724345009
#   lgb_score_std:  0.001535633961606915
# (LightGBM warned that num_threads from n_jobs=24 overrides nthread=28 and
#  that 'silent' is an unknown parameter.)
# Write the submission file: id + predicted default probability.
test_out_file = "/Users/HaoShaochun/Yam/FinancialRiskControl/data/testA_result_V15.0.csv"
# Currently LightGBM only; the commented expression shows the 50/50 xgb blend option.
rh_test = lgb_test#*0.5 + xgb_test*0.5
data_test['isDefault'] = rh_test
data_test[['id','isDefault']].to_csv(test_out_file, index=False)