#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os, subprocess, pickle
import pandas as pd
import numpy as np
from IPython.display import display
# from lib_feature_engineering import *


# # Combine features

# In[2]:


# check the contents of the features folder
subprocess.check_output(["ls", "features"]).splitlines()


# In[10]:


# feature files selected for joining
ls_feat_file = [
    'baseline.pkl.bz2',
    'baseline_extend.pkl.bz2',
    'bureau_balance_1year.pkl.bz2',
    'bureau_balance_2year.pkl.bz2',
    'bureau_balance_gt3year.pkl.bz2',
    'bureau_balance_lt1year.pkl.bz2',
    'bureau_balance.pkl.bz2',
    'bureau.pkl.bz2',
    'credit_card_balance.pkl.bz2',
    'installments_payments_gt3year.pkl.bz2',
    'installments_payments_in1year.pkl.bz2',
    'installments_payments_in2year.pkl.bz2',
    'installments_payments.pkl.bz2',
    'pos_cash_gt3year.pkl.bz2',
    'pos_cash_in1year.pkl.bz2',
    'pos_cash_in2year.pkl.bz2',
    'pos_cash.pkl.bz2',
    'prev_app.pkl.bz2'
]


# In[11]:


# %%time (cell magic in the notebook; kept as a comment in this script export)
# use the first feature set as the base table
feat_path = os.path.join("features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")

# left-join the remaining feature sets one by one
for fname in ls_feat_file[1:]:
    feat_path = os.path.join("features", fname)
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")
    print(fname, pdf_feat.shape)

    # prefix column names with the source table to avoid collisions
    tbl_prefix = fname.split(".")[0]
    rename_col = {cname: "{}_{}".format(tbl_prefix, cname) for cname in pdf_feat.columns if cname != "SK_ID_CURR"}
    pdf_feat.rename(columns=rename_col, inplace=True)

    # join on the application id
    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")

print("rows, columns", pdf_combined.shape)
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())


# In[ ]:


# %%time
if False:
    def filter_feat_low_auc(pdf_label, pdf_input, threshold=0.501):
        pdf_eval = feature_evaluate(pdf_label, pdf_input)
        ls_filtered_feat = pdf_eval.query("auc > {}".format(threshold))["name"].tolist()
        return ls_filtered_feat


    # load train data
    data_path = "home-credit-default-risk/application_train.csv"
    pdf_train = pd.read_csv(data_path)

    # filter by tvt code
    pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
    pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
                          .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                          .drop(columns=["tvt_code"]))

    ls_filtered_feat = filter_feat_low_auc(pdf_train_filtered, pdf_combined, threshold=0.501)
    pdf_combined = pdf_combined[["SK_ID_CURR"] + ls_filtered_feat]
    print("After filtering: {}".format(pdf_combined.shape))


# # Join with label

# In[12]:


pdf_tvt = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
print(pdf_tvt.shape)
display(pdf_tvt.head())


# In[13]:


pdf_tvt["tvt_code"].value_counts()


# In[14]:


pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")
print(pdf_features_label.shape)
display(pdf_features_label.head().T)


# In[15]:


# %%time
# save combined features with label
# pdf_features_label.to_pickle(os.path.join("features", "pdf_features_label.pkl.bz2"), compression="bz2")
pdf_features_label.to_csv(os.path.join("features", "pdf_features_label.csv.bz2"), compression="bz2")
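# In[ ]:


# The disabled filtering cell above relies on feature_evaluate from
# lib_feature_engineering, which is commented out and not included here.
# The sketch below is a minimal stand-in under the ASSUMPTION that
# feature_evaluate scores each feature by its univariate ROC AUC against a
# TARGET column and returns a DataFrame with "name" and "auc" columns (the
# shape filter_feat_low_auc expects). The real helper may differ; treat this
# as illustrative only. feature_evaluate_sketch, label_col, and id_col are
# hypothetical names introduced for this example.
from sklearn.metrics import roc_auc_score

def feature_evaluate_sketch(pdf_label, pdf_input, label_col="TARGET", id_col="SK_ID_CURR"):
    # align features with labels on the application id
    pdf_eval_input = pdf_label[[id_col, label_col]].merge(pdf_input, on=id_col, how="inner")
    y_true = pdf_eval_input[label_col]

    rows = []
    for cname in pdf_eval_input.columns:
        if cname in (id_col, label_col):
            continue
        feat = pdf_eval_input[cname]
        # skip non-numeric or constant columns, which have no usable AUC
        if not pd.api.types.is_numeric_dtype(feat) or feat.nunique(dropna=True) < 2:
            continue
        # median-impute missing values so roc_auc_score gets finite inputs
        auc = roc_auc_score(y_true, feat.fillna(feat.median()))
        # fold AUC below 0.5 back above 0.5 so "near 0.5" means "uninformative"
        # regardless of the feature's direction (an assumption of this sketch)
        rows.append({"name": cname, "auc": max(auc, 1.0 - auc)})

    return pd.DataFrame(rows).sort_values("auc", ascending=False).reset_index(drop=True)

# Hypothetical usage, mirroring the query inside filter_feat_low_auc:
# pdf_eval = feature_evaluate_sketch(pdf_train_filtered, pdf_combined)
# ls_filtered_feat = pdf_eval.query("auc > 0.501")["name"].tolist()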