#!/usr/bin/env python
# coding: utf-8

# # Hands-on: Study feature importance in the MAGIC Cherenkov telescope sample
#
# Ablation study: train a classifier on all features, then retrain with each
# single feature removed and compare the drop in ROC AUC against XGBoost's
# built-in importance measures ("weight", "gain", "cover").

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# read data (MAGIC gamma/hadron shower sample, served by the lecture page)
filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/magic04_data.txt"
df = pd.read_csv(filename, engine='python')

# relabel: gamma shower (g) --> 1 (signal), hadron shower (h) --> 0 (background)
df['class'] = df['class'].map({'g': 1, 'h': 0})

# y = value to predict, X = features
y = df['class'].values
X = df[[col for col in df.columns if col != "class"]]

# generate training and test samples
# random_state fixes the shuffle so AUC numbers are reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=42)

# train the normal xgb classifier with all features and calculate the AUC
import xgboost as xgb
from sklearn.metrics import roc_auc_score

XGBclassifier = xgb.sklearn.XGBClassifier(nthread=-1, seed=1, n_estimators=1000)
XGBclassifier.fit(X_train, y_train, verbose=False)

# AUC needs the signal-class probability, not the hard 0/1 prediction
y_score = XGBclassifier.predict_proba(X_test)[:, 1]
auc_all = roc_auc_score(y_test, y_score)
print(f"AUC with all {len(X.columns)} features: {auc_all:.4f}")

# #### Now try to train classifiers with n-1 features and recalculate the AUC
# The feature whose removal hurts the AUC most is the most important one.
auc_without = {}
for feat in X.columns:
    reduced_cols = [c for c in X.columns if c != feat]
    clf = xgb.sklearn.XGBClassifier(nthread=-1, seed=1, n_estimators=1000)
    clf.fit(X_train[reduced_cols], y_train, verbose=False)
    score = clf.predict_proba(X_test[reduced_cols])[:, 1]
    auc_without[feat] = roc_auc_score(y_test, score)
    print(f"AUC without {feat:12s}: {auc_without[feat]:.4f} "
          f"(delta = {auc_all - auc_without[feat]:+.4f})")

# summary plot: AUC loss per removed feature, most important first
losses = pd.Series({f: auc_all - a for f, a in auc_without.items()}).sort_values()
losses.plot.barh()
plt.xlabel("AUC loss when feature is removed")
plt.tight_layout()

# #### How does this compare to the provided `plot_importance` function from XGBoost?
# (see the XGBoost plotting API,
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.plotting)
# Do you get the same answer for all three performance measures provided by
# XGBoost ("weight", "gain", or "cover")?
#
# NOTE(review): the three measures generally do NOT rank features identically —
# "weight" counts splits, "gain" averages loss reduction, "cover" averages the
# number of samples affected. Compare each ranking with the ablation result.
for measure in ("weight", "gain", "cover"):
    fig, ax = plt.subplots()
    xgb.plot_importance(XGBclassifier, importance_type=measure,
                        title=f"feature importance ({measure})", ax=ax)
plt.show()