# Three gradient-boosted tree ensembles, each limited to 50 estimators.
# They are built directly inside the list and then unpacked so that the
# individual names remain available to the rest of the script.
models = [
    GradientBoostingClassifier(n_estimators=50),
    XGBClassifier(n_estimators=50),
    LGBMClassifier(n_estimators=50),
]
clf_gbdt, clf_xgb, clf_lgb = models

# Second-stage linear model that is refit on top of each ensemble's leaves.
lr = LogisticRegression(solver='lbfgs', max_iter=500)

# Display names, in the same order as `models`.
names = ['GBDT', 'XGBoost', 'LightGBM']

# Receives one metrics DataFrame per model.
metric_scores = []
# For every boosted-tree model:
#   1. fit it on the raw features and score it on the test split;
#   2. take the leaf index each tree assigns to a sample, one-hot encode
#      those indices, stack them next to the raw features and fit the
#      logistic regression `lr` on the combined matrix;
#   3. store both sets of scores as a DataFrame in `metric_scores`.
for model, name in zip(models, names):
    # --- Stage 1: the tree ensemble on the original features -------------
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary target -- TODO confirm).
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_prob)
    fscore = f1_score(y_test, y_pred)

    # --- Stage 2: per-tree leaf indices as categorical features ----------
    # Each library exposes the leaf indices through a different call.
    if name == 'GBDT':
        # The scikit-learn result is 3-D; only index 0 of the last axis is
        # kept (presumably the single class axis of a binary problem).
        X_train_leaves = model.apply(X_train)[:, :, 0]
        X_test_leaves = model.apply(X_test)[:, :, 0]
    elif name == 'LightGBM':
        X_train_leaves = model.predict(X_train, pred_leaf=True)
        X_test_leaves = model.predict(X_test, pred_leaf=True)
    else:
        X_train_leaves = model.apply(X_train)
        X_test_leaves = model.apply(X_test)

    # Fit the encoder on the TRAINING leaves only, so that no information
    # from the test split leaks into preprocessing. A leaf that is reached
    # only by test samples is encoded as all zeros via
    # handle_unknown='ignore'; such a column would have been constant zero
    # during training anyway.
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    X_train_new = enc.fit_transform(X_train_leaves.astype(np.int32))
    X_test_new = enc.transform(X_test_leaves.astype(np.int32))

    # Encoded leaf columns first, original features after them.
    X_train_hstack = hstack([X_train_new, X_train])
    X_test_hstack = hstack([X_test_new, X_test])

    # --- Stage 3: logistic regression on leaves + raw features -----------
    # `lr` is shared across iterations; it is refit here for every model.
    lr.fit(X_train_hstack, y_train)
    y_pred_2 = lr.predict(X_test_hstack)
    y_pred_prob_2 = lr.predict_proba(X_test_hstack)[:, 1]
    new_acc = accuracy_score(y_test, y_pred_2)
    new_auc = roc_auc_score(y_test, y_pred_prob_2)
    new_fscore = f1_score(y_test, y_pred_2)

    # Row order in each column: F1, accuracy, AUC.
    score = {
        'OriginalFeature': [fscore, acc, auc],
        'NewFeature': [new_fscore, new_acc, new_auc],
    }
    result = pd.DataFrame(score)
    metric_scores.append(result)