Author： 马肖
E-Mail： maxiaoscut@aliyun.com
GitHub： https://github.com/Albertsr

In [1]:

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.datasets import make_classification
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Synthetic binary-classification problem: 20 features, 18 of them informative
# and 2 redundant, with 3 clusters per class. Seeded so the data is reproducible.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=18,
    n_redundant=2,
    n_classes=2,
    n_clusters_per_class=3,
    random_state=2017,
)

# Hold out 25% of the rows for evaluation; the split itself is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [2]:

# One seed shared by all boosted models so that Restart & Run All rebuilds the
# same trees (the gradient-boosting model was previously left unseeded).
SEED = 0

clf_gbdt = GradientBoostingClassifier(n_estimators=50, random_state=SEED)
clf_xgb = XGBClassifier(n_estimators=50, random_state=SEED)
clf_lgb = LGBMClassifier(n_estimators=50, random_state=SEED)
lr = LogisticRegression(max_iter=500, solver='lbfgs')

models = [clf_gbdt, clf_xgb, clf_lgb]
# Short labels, listed in the same order as `models`.
names = ['GBDT', 'XGBoost', 'LightGBM']


def leaf_indices(model, X):
    """Return the leaf index each row of X reaches in every tree of a fitted model.

    Parameters
    ----------
    model : fitted GradientBoostingClassifier, LGBMClassifier or XGBClassifier
    X : array-like of samples to route through the trees

    Returns
    -------
    numpy.ndarray of dtype int32, one row per sample.
    """
    # Dispatch on the estimator type rather than on a parallel list of name
    # strings, so reordering or relabelling the models cannot pick the wrong API.
    if isinstance(model, GradientBoostingClassifier):
        # apply() is indexed as a 3-D array here; keep index 0 of the last axis
        # (the scikit-learn docs state that axis has length 1 for binary targets).
        leaves = model.apply(X)[:, :, 0]
    elif isinstance(model, LGBMClassifier):
        leaves = model.predict(X, pred_leaf=True)
    else:
        leaves = model.apply(X)
    return np.asarray(leaves).astype(np.int32)


def f1_acc_auc(y_true, y_label, y_score):
    """Return [F1, accuracy, ROC AUC] for hard labels `y_label` and scores `y_score`."""
    return [
        f1_score(y_true, y_label),
        accuracy_score(y_true, y_label),
        roc_auc_score(y_true, y_score),
    ]


metric_scores = []
for model in models:
    # Baseline: the boosted model on the raw features.
    model.fit(X_train, y_train)
    original_scores = f1_acc_auc(
        y_test, model.predict(X_test), model.predict_proba(X_test)[:, 1]
    )

    X_train_leaves = leaf_indices(model, X_train)
    X_test_leaves = leaf_indices(model, X_test)

    # Fit the encoder on the training leaves only, so no information about the
    # test set enters preprocessing. With handle_unknown='ignore', a test row
    # that lands in a leaf no training row reached gets all zeros for that tree
    # instead of raising.
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    X_train_new = enc.fit_transform(X_train_leaves)
    X_test_new = enc.transform(X_test_leaves)

    # Stacked design matrix: one-hot leaf features followed by the raw features.
    X_train_hstack = hstack([X_train_new, X_train])
    X_test_hstack = hstack([X_test_new, X_test])

    # Second stage: logistic regression refitted from scratch for each model.
    lr.fit(X_train_hstack, y_train)
    stacked_scores = f1_acc_auc(
        y_test, lr.predict(X_test_hstack), lr.predict_proba(X_test_hstack)[:, 1]
    )

    # One 3-row frame per model; rows are in F1, accuracy, AUC order.
    score = {'OriginalFeature': original_scores, 'NewFeature': stacked_scores}
    metric_scores.append(pd.DataFrame(score))

In [3]:

# Row labels: an outer level naming each stacked pipeline and an inner level
# naming the metric, matching the order in which the per-model frames were built.
model_metrics = ['F1', 'ACC', 'AUC']
model_names = ['GBDT + LR', 'XGBoost + LR', 'LightGBM + LR']
col_idx = pd.MultiIndex.from_product([model_names, model_metrics])

# Stack the per-model score frames vertically (row-wise is the concat default)
# and replace their positional index with the two-level labels.
df_contrast = pd.concat(metric_scores)
df_contrast.index = col_idx

# Bare last expression so the notebook renders the table.
df_contrast

Out[3]:

		OriginalFeature	NewFeature
GBDT + LR	F1	0.841070	0.875536
	ACC	0.838400	0.872400
	AUC	0.925139	0.946116
XGBoost + LR	F1	0.837136	0.872116
	ACC	0.834400	0.869200
	AUC	0.921574	0.943909
LightGBM + LR	F1	0.910658	0.921269
	ACC	0.908800	0.919600
	AUC	0.969011	0.971790