#!/usr/bin/env python
# coding: utf-8

# # Forest Cover Type Dataset
# Dataset: http://archive.ics.uci.edu/ml/datasets/Covertype

# ## Data Exploration
#
# There are 12 independent variables.
# 10 of them are quantitative.
# The other 2 (wilderness area and soil type) are categorical and come
# already one-hot encoded, as 4 and 40 binary columns respectively.
# The dependent variable (cover type) is categorical.

# In[2]:

# Initialization
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:

# Data is in CSV format, but the file doesn't have a header,
# so manually specify column names
import numpy as np

cont_names = [
    "Elevation", "Aspect", "Slope",
    "R_Hydrology", "Z_Hydrology", "R_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "R_Fire_Points",
]  # Continuous variables
cont_dict = {name: np.float64 for name in cont_names}

area_names = ['WArea_' + str(i + 1) for i in range(4)]
area_dict = {name: np.int64 for name in area_names}
soil_names = ['Soil_' + str(i + 1) for i in range(40)]
soil_dict = {name: np.int64 for name in soil_names}
cat_names = area_names + soil_names  # Categorical variables

target = 'Cover_Type'
names = cont_names + cat_names  # All column names except target
dtypes_dict = {**cont_dict, **area_dict, **soil_dict}

# In[4]:

# Read data
import os
import pandas as pd

data = pd.read_csv('covtype.data', header=None, names=names + [target], dtype=dtypes_dict)

# In[5]:

print(data.shape)

# ### First look at continuous variables

# In[6]:

data.head()[cont_names]

# In[7]:

data.describe()[cont_names]

# Note that there are no missing values; the readme confirms this as well.

# In[8]:

# Class distribution
print("percentage distribution of classes:")
series = 100 * data['Cover_Type'].value_counts() / data['Cover_Type'].count()
ax = data.Cover_Type.value_counts().plot(kind='pie')
ax.set_aspect('equal')
pd.DataFrame({'percentage': series})

# We have 7 classes, of which 2 classes form 85% of the data and 2 classes have
# less than 2% representation each. The classes are therefore heavily skewed,
# and we need to take that into account during analysis.
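# One standard way to account for this skew, which the cross-validation cells
# further below also rely on, is stratified splitting, so that every train/test
# split keeps roughly the same class proportions as the full dataset.
# A minimal sketch (this split and its variable names are illustrative only and
# are not reused later):

# In[ ]:

from sklearn.model_selection import train_test_split

_train, _test = train_test_split(data, test_size=0.25, random_state=0,
                                 stratify=data['Cover_Type'])
# The test-split proportions closely match the full-data distribution shown above
print(100 * _test['Cover_Type'].value_counts(normalize=True).sort_index())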
# ## Correlated variables

# In[9]:

# Find correlated variables
X_cont = data[cont_names]
corr_data = X_cont.corr()

size = len(cont_names)
corr_list = []
for i in range(size):
    for j in range(i + 1, size):
        cc = corr_data.iloc[i, j]
        if abs(cc) > 0.5:
            corr_list.append((cc, i, j))

corr_list.sort(key=(lambda x: abs(x[0])), reverse=True)
[(cc, cont_names[i], cont_names[j]) for cc, i, j in corr_list]

# In[10]:

g = sns.pairplot(data, hue='Cover_Type', size=3,
                 x_vars=['Hillshade_9am'], y_vars=['Hillshade_3pm'])
g.map_offdiag(plt.scatter, s=35, alpha=0.5)

# In[11]:

sns.pairplot(data, hue='Cover_Type', size=3,
             y_vars=['Hillshade_Noon'], x_vars=['Hillshade_9am', 'Hillshade_3pm'])

# In[12]:

pp = sns.pairplot(data.iloc[::20, :], hue='Slope', size=3, x_vars=['Aspect'],
                  y_vars=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])
pp.fig.legends.clear()
print('Slope increases from red to yellow to green to blue to pink')

# In[13]:

pp = sns.pairplot(data[data['Aspect'] <= 360].iloc[::200, :], hue='Aspect', size=3,
                  x_vars=['Slope'],
                  y_vars=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])
pp.fig.legends.clear()
print('Aspect increases from red to yellow to green to blue to pink')

# Since hillshade first increases and then decreases with slope,
# hillshade is actually a measure of the sunlight falling on an area
# (contrary to what its name suggests).
# Let's find the slope at which maximum sunlight is observed.

# In[14]:

series = data.loc[data['Hillshade_Noon'] == data['Hillshade_Noon'].max(), 'Slope']
slope_mean, slope_std = series.mean(), series.std()
print("Slope with max sunlight: {}° ± {}°".format(slope_mean, slope_std))

# ```
# Slope where we get maximum sunlight = 20.35°.
# Latitude = 40.63° (from the readme and Google).
# At solar noon the sun's zenith angle is latitude minus declination, and noon
# sunlight on a slope is maximal when the slope faces the sun directly,
# so declination = 40.63° - 20.35° = 20.28° north, which means the data was
# collected in summer.
# Assuming the declination varies sinusoidally over the year,
# day of year = asin(20.28 / 23.5) * 365.25 / (2 * pi) = 61 days
# after the spring equinox, i.e. March 20.
# Therefore the estimated data collection date is May 20.
# ```
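# A quick numerical sanity check of the arithmetic above. This is only a sketch:
# the numbers are the ones quoted above (slope ≈ 20.35°, latitude ≈ 40.63°,
# axial tilt 23.5°) rather than recomputed from the data, and the variable names
# are new, not part of the original analysis.

# In[ ]:

import math

slope_deg = 20.35       # slope receiving maximum noon hillshade (see In[14])
latitude_deg = 40.63    # study-area latitude, from the readme
declination_deg = latitude_deg - slope_deg  # ≈ 20.28° north
days_after_equinox = math.asin(declination_deg / 23.5) * 365.25 / (2 * math.pi)
print("~{:.0f} days after the spring equinox (March 20), i.e. around May 20"
      .format(days_after_equinox))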
# In[15]:

sns.pairplot(data, hue='Cover_Type', size=5,
             x_vars=['R_Hydrology'], y_vars=['Z_Hydrology'])

# ## Preprocessing
# ### Feature creation
#
# By reading the dataset description, we can see that soils are also grouped by
# 'climatic zone' (referred to as `czone` from now on) and 'geologic zone'
# (referred to as `gzone` from now on). This information is not present in the
# dataset, but we can add it ourselves.

# In[18]:

# "One-cool" the binary columns (reverse of the one-hot encoding procedure)
data['soil_type'] = 0
for i in range(1, 41):
    data['soil_type'] += i * data['Soil_' + str(i)]

data['warea'] = 0
for i in range(1, 5):
    data['warea'] += i * data['WArea_' + str(i)]

data.head()

# In[19]:

# Get soil type data from readme.txt
czone_to_soil = {
    2: range(1, 7),
    3: range(7, 9),
    4: range(9, 14),
    5: range(14, 16),
    6: range(16, 19),
    7: range(19, 35),
    8: range(35, 41),
}
gzone_to_soil = {
    1: [14, 15, 16, 17, 19, 20, 21],
    2: [9, 22, 23],
    5: range(7, 9),
    7: list(range(1, 7)) + list(range(10, 14)) + [18] + list(range(24, 41)),
}

def invert_dict(d):
    """Takes a dict where values are iterables and returns an inverse mapping"""
    newdict = {}
    for k, vlist in d.items():
        for v in vlist:
            newdict[v] = k
    return newdict

soil_to_czone = invert_dict(czone_to_soil)
soil_to_gzone = invert_dict(gzone_to_soil)

# In[20]:

# Make columns for czone and gzone and one-hot encode them
data['czone'] = data['soil_type'].map(soil_to_czone)
data['gzone'] = data['soil_type'].map(soil_to_gzone)

for soil_type in range(1, 41):
    czone_num = soil_to_czone[soil_type]
    data['czone' + str(czone_num)] = data['czone'].map(lambda x: 1 if x == czone_num else 0)
for soil_type in range(1, 41):
    gzone_num = soil_to_gzone[soil_type]
    data['gzone' + str(gzone_num)] = data['gzone'].map(lambda x: 1 if x == gzone_num else 0)

# In[21]:

czone_names = ['czone' + str(c) for c in czone_to_soil.keys()]
gzone_names = ['gzone' + str(g) for g in gzone_to_soil.keys()]
cg_names = czone_names + gzone_names
cat_names += cg_names
names += cg_names

# In[22]:

# Write only the "cooled" data to a separate CSV file if needed
cool_data = data[list(data.columns[:10]) + ['warea', 'soil_type', 'czone', 'gzone', 'Cover_Type']]
# cool_data.to_csv('cool_data.csv', index=False)

# ## Classification

# In[23]:

# Prepare inputs and targets
from collections import OrderedDict

y = data[target]
X = data[names]
# Don't include created features
X_no_fc = data[cont_names + area_names + soil_names]
# Replace original soil columns by created features
X_only_fc = data[cont_names + area_names + czone_names + gzone_names]
# Don't include any soil data
X_no_soil = data[cont_names + area_names]

X_dict = OrderedDict([
    ('X_no_fc', X_no_fc),
    ('X_only_fc', X_only_fc),
    ('X_no_soil', X_no_soil),
    ('X', X),
])

# In[24]:

print("Null Accuracy:", y.value_counts().max() / y.count())

# In[44]:

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn import metrics

def my_tts_score(clf, X, y):
    """Accuracy on a single stratified 75/25 train/test split."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2, stratify=y)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

def my_cross_val_score(clf, X, y):
    """Accuracies from stratified 4-fold cross-validation."""
    fold_generator = StratifiedKFold(n_splits=4, random_state=2, shuffle=True)
    return cross_val_score(clf, X, y, cv=fold_generator, n_jobs=-1)

def print_cv_accuracy(clf, X, y):
    accs = my_cross_val_score(clf, X, y)
    print("{}-fold CV accuracy: {} % ± {} %".format(4, 100 * accs.mean(), 100 * accs.std()))

# In[26]:

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

df = pd.DataFrame({}, columns=['mean', 'std', 'depth', 'node_count'], index=X_dict.keys())
for X_name, X_val in X_dict.items():
    dtree = DecisionTreeClassifier()
    accs = my_cross_val_score(dtree, X_val, y)
    df.loc[X_name, 'mean'] = 100 * accs.mean()
    df.loc[X_name, 'std'] = 100 * accs.std()
    dtree.fit(X_val, y)
    df.loc[X_name, 'depth'] = dtree.tree_.max_depth
    df.loc[X_name, 'node_count'] = dtree.tree_.node_count
print('CV Accuracy (in %) with DecisionTree:')
df

# We can see that adding the new features doesn't help much in increasing the
# accuracy of the decision tree.
# By default, scikit-learn grows completely unpruned trees.
# Let's try pre-pruning (scikit-learn doesn't yet support post-pruning).
#
# We will build a decision tree for various impurity thresholds
# and plot a graph showing the variation.
# This will help us find the optimally pruned tree:
# pruned enough to generalize and avoid overfitting,
# but not pruned so much that it loses the complexity needed to fit the data.

# In[27]:

X = X_no_fc

# In[28]:

# Pre-pruning
def plot_acc_vs_impurity(minimp, maximp, intervals=10):
    # Plots CV accuracy against impurity thresholds from minimp to maximp,
    # evaluated at (intervals + 1) evenly spaced points.
    impurities = [minimp + (maximp - minimp) * x / intervals for x in range(intervals + 1)]
    maccs = []
    for impurity in impurities:
        dtree = DecisionTreeClassifier(min_impurity_split=impurity)
        accs = my_cross_val_score(dtree, X, y)
        maccs.append(accs.mean())
    plt.plot(impurities, maccs, 'b')

# In[53]:

plot_acc_vs_impurity(0, 0.1, 10)

# In[33]:

plot_acc_vs_impurity(0, 0.01, 50)

# Since accuracy varies so erratically with the impurity threshold, increasing
# the threshold will probably not give a significant benefit.

# ### Feature importance

# In[48]:

# Find the gini importance of features
def print_importance_data(col_names, feature_importances):
    fi = sorted(zip(col_names, feature_importances), key=(lambda x: x[1]), reverse=True)
    for i, (feature, importance) in enumerate(fi[:15]):
        print('{}. {}:\t{}'.format(i + 1, feature.ljust(15), importance))
    print("Contribution of 15 most important features:", sum([x[1] for x in fi[:15]]))

# In[30]:

from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree.fit(X, y)
print_importance_data(X.columns, dtree.feature_importances_)

# In[31]:

# Find the accuracy when using only the 15 most important features
essential_features = [
    "Elevation", "R_Fire_Points", "R_Roadways",
    "R_Hydrology", "Z_Hydrology", "Hillshade_Noon",
    "Aspect", "Hillshade_9am", "Hillshade_3pm",
    "czone8", "czone2", "gzone7",
    "WArea_3", "Slope", "WArea_1",
]
X_ess = data[essential_features]
print_cv_accuracy(DecisionTreeClassifier(), X_ess, y)

# Dropping the remaining features didn't increase accuracy, so they were
# probably not just noise.
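# As a side note, the 15-feature list above can also be derived programmatically
# from the fitted tree's importances instead of being typed out by hand. A minimal
# sketch, reusing `dtree`, `X`, `y` and `print_cv_accuracy` from the cells above
# (the `top_features` list computed here is illustrative and depends on which
# feature set `X` currently refers to):

# In[ ]:

top_idx = np.argsort(dtree.feature_importances_)[::-1][:15]
top_features = [X.columns[i] for i in top_idx]
print(top_features)
print_cv_accuracy(DecisionTreeClassifier(), data[top_features], y)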
# In[43]:

# Ensemble Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf_list = [RandomForestClassifier, AdaBoostClassifier]
importances_list = []
clf_to_name = [(clf_type, clf_type.__name__[:-len('Classifier')]) for clf_type in clf_list]
name_to_accs = {clf_name: [] for clf_type, clf_name in clf_to_name}

ns = [10, 40]
for n in ns:
    for clf_type, clf_name in clf_to_name:
        clf = clf_type(n_estimators=n)
        # Accuracy on a single stratified train/test split (not cross-validated)
        acc = my_tts_score(clf, X, y)
        name_to_accs[clf_name].append(100 * acc)
        clf.fit(X, y)
        importances_list.append(clf.feature_importances_)

pd.DataFrame(name_to_accs, index=ns)

# In[52]:

# importances_list[0] is the RandomForest fitted with n_estimators=10
print("RandomForest feature importances:")
print_importance_data(X.columns, importances_list[0])

# In[ ]:
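
# The ensemble accuracies above come from a single train/test split (my_tts_score).
# For a figure directly comparable to the 4-fold CV results reported for the
# decision trees, the same helper can be reused. This is only a sketch and is
# slow, since it refits the forest on each of the 4 folds:

print_cv_accuracy(RandomForestClassifier(n_estimators=40), X, y)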