#!/usr/bin/env python
# coding: utf-8
"""Train, persist, evaluate and submit a baseline XGBoost classifier.

Workflow (mirrors the notebook cells this script was exported from):

1. Load the combined feature table, which carries the ``TARGET`` label and a
   ``tvt_code`` column naming the split each row belongs to.
2. Fit an ``xgb.XGBClassifier`` on the ``train`` split, evaluating on the
   ``train``/``val``/``test`` splits.
3. Pickle the fitted model together with its feature list.
4. Plot ROC/AUC and feature importance for the reloaded model.
5. Score the ``kaggle_test`` rows and write the submission CSV.
"""

# In[1]:

import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from IPython.display import display
from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import plot_importance

# some settings for displaying Pandas results
pd.set_option("display.width", 2000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.precision", 4)
try:
    # pandas >= 1.0 spells "no limit" as None; -1 is deprecated and rejected
    # by newer releases.
    pd.set_option("display.max_colwidth", None)
except ValueError:
    # Older pandas only accepts an int here and uses -1 for "no limit".
    pd.set_option("display.max_colwidth", -1)


# # Load combined features with label

# In[2]:

# pdf_features_label = pd.read_pickle(os.path.join("features", "pdf_features_label.pkl.bz2"), compression="bz2")
pdf_features_label = pd.read_csv(
    os.path.join("../04_feature_engineering/features", "pdf_features_label.csv.bz2"),
    compression="bz2",
)

# Columns that identify or label a row; everything else is a model feature.
meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [cname for cname in pdf_features_label.columns if cname not in meta_cols]

#
print("Number of features: {}".format(len(ls_features)))
print(pdf_features_label.shape)
display(pdf_features_label.head().T)


# In[3]:

# Row count per split (a bare expression would be a no-op outside a notebook).
print(pdf_features_label["tvt_code"].value_counts())


# # Modeling

# In[4]:

# Version tag embedded in the model and submission file names.
version = "v07"


# In[5]:

def get_Xy_from_pdf(pdf_input, ls_features, tvt_code):
    """Return the feature matrix and label vector for one split.

    Args:
        pdf_input: DataFrame holding the feature columns plus ``TARGET`` and
            ``tvt_code``.
        ls_features: Names of the columns to use as model features.
        tvt_code: Value of ``tvt_code`` selecting the rows to return.

    Returns:
        Tuple ``(X, y)``: ``X`` is the selected rows restricted to
        ``ls_features`` and ``y`` is their ``TARGET`` column.
    """
    pdf_data = pdf_input[pdf_input["tvt_code"] == tvt_code].copy()

    #
    X = pdf_data[ls_features]
    y = pdf_data["TARGET"]

    return (X, y)


#
X_train, y_train = get_Xy_from_pdf(pdf_features_label, ls_features, "train")
X_val, y_val = get_Xy_from_pdf(pdf_features_label, ls_features, "val")
X_test, y_test = get_Xy_from_pdf(pdf_features_label, ls_features, "test")


# In[6]:

# This cell used to be a string handed to the IPython ``%%time`` cell magic,
# which only works inside IPython. It is plain code now so the script also
# runs under a regular interpreter; timing is kept via ``time.perf_counter``.
# NOTE(review): "silent" and passing "eval_metric"/"early_stopping_rounds" to
# fit() look like older xgboost API spellings - confirm against the pinned
# xgboost version before upgrading.
param_init = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "max_depth": 4,  # default: 3 only for depthwise
    "n_estimators": 1000,  # default: 500
    "learning_rate": 0.025,  # default: 0.05
    "subsample": 0.7,
    "colsample_bytree": 0.6,  # default: 1.0
    "colsample_bylevel": 0.5,  # default: 1.0
    "random_state": 0,

    #
    "silent": True,
    "n_jobs": 16,

    #
    "tree_method": "hist",  # default: auto
    "grow_policy": "lossguide",  # default depthwise
}

param_fit = {
    "eval_metric": "auc",
    "early_stopping_rounds": 500,  # default: 100
    "verbose": 200,
    "eval_set": [(X_train, y_train), (X_val, y_val), (X_test, y_test)],
}

fit_started = time.perf_counter()
xgb_model = xgb.XGBClassifier(**param_init)
xgb_model.fit(X_train, y_train, **param_fit)
evals_result = xgb_model.evals_result()
print("Wall time: {:.1f} s".format(time.perf_counter() - fit_started))


# In[7]:

# save model to file
res_model = {
    "xgb_model": xgb_model,
    "features": ls_features,
}

os.makedirs("models", exist_ok=True)
# Context manager guarantees the file is flushed and closed after dumping.
with open("models/xgb_model_baseline_{}.mod".format(version), "wb") as output_file:
    pickle.dump(res_model, output_file)


# # Model evaluates

# In[8]:

# read model
# NOTE: pickle executes arbitrary code on load; only load model files produced
# by this pipeline.
with open("models/xgb_model_baseline_{}.mod".format(version), "rb") as input_file:
    res_model = pickle.load(input_file)
print(res_model.keys())


# In[9]:

def visualize_auc(pdf, tvt_code, res_model):
    """Plot the ROC curve and predicted-probability histogram for one split.

    Args:
        pdf: DataFrame holding the feature columns plus ``TARGET`` and
            ``tvt_code``.
        tvt_code: Value of ``tvt_code`` selecting the rows to evaluate.
        res_model: Dict with the fitted classifier under ``"xgb_model"`` and
            its feature names under ``"features"``.
    """
    # get Xy and predict
    X, y = get_Xy_from_pdf(pdf, res_model["features"], tvt_code)
    y_pred = res_model["xgb_model"].predict_proba(X)[:, 1]

    # get values
    auc_value = metrics.roc_auc_score(y, y_pred)
    fpr, tpr, _ = metrics.roc_curve(y, y_pred)

    # plot
    figure, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3))
    lw = 2
    ax1.plot(fpr, tpr, color="darkorange", lw=lw, label="ROC")
    ax1.plot([0, 1], [0, 1], color="navy", label="Random", lw=lw, linestyle="--")
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel("False Positive Rate")
    ax1.set_ylabel("True Positive Rate")
    ax1.set_title("AUC = %0.5f" % (auc_value))
    ax1.legend(loc="lower right")

    # prediction probability histogram
    ax2.set_title("{} set (size: {})".format(tvt_code, y.shape[0]))
    ax2.hist(y_pred, bins=200)

    plt.show()


#
visualize_auc(pdf_features_label, "test", res_model)


# In[10]:

# Scale the figure height with the number of features so labels stay legible.
fig_height = len(res_model["features"]) / 4
fig, ax = plt.subplots(figsize=(10, fig_height))
plot_importance(res_model["xgb_model"], ax=ax)
plt.show()


# # Save submission

# In[11]:

# Select the Kaggle test rows once and reuse them for features and IDs.
pdf_kaggle_test = pdf_features_label.query("tvt_code == 'kaggle_test'")
X_kaggle_test = pdf_kaggle_test[ls_features]
y_test_pred = xgb_model.predict_proba(X_kaggle_test)[:, 1]
print(y_test_pred.mean())


# In[12]:

SK_IDs = pdf_kaggle_test["SK_ID_CURR"].tolist()
pdf_submiss = pd.DataFrame({"SK_ID_CURR": SK_IDs, "TARGET": y_test_pred})

os.makedirs("submissions", exist_ok=True)
pdf_submiss.to_csv("submissions/submission_baseline_{}.csv".format(version), index=False)
display(pdf_submiss.head())


# ![submission_baseline_v02](submissions/submission_baseline_v02.png "submission_baseline_v02")

# ![submission_baseline_v01](submissions/submission_baseline_v01.png "submission_baseline_v01")