#!/usr/bin/env python
# coding: utf-8

# # Forest Cover Type Dataset
# Dataset: http://archive.ics.uci.edu/ml/datasets/Covertype

# ## Data Exploration
#
# There are 12 independent variables.
# 10 of them are quantitative.
# The other 2 (wilderness area and soil type) are categorical and come
# already one-hot encoded, as 4 and 40 binary columns respectively.
# The dependent variable (cover type) is categorical.

# In[2]:

# Initialization
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:

# Data is in CSV format, but the file doesn't have a header,
# so manually specify column names
import numpy as np

cont_names = [
    "Elevation", "Aspect", "Slope",
    "R_Hydrology", "Z_Hydrology", "R_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "R_Fire_Points",
]  # Continuous variables
cont_dict = {name: np.float64 for name in cont_names}

area_names = ['WArea_' + str(i + 1) for i in range(4)]
area_dict = {name: np.int64 for name in area_names}
soil_names = ['Soil_' + str(i + 1) for i in range(40)]
soil_dict = {name: np.int64 for name in soil_names}
cat_names = area_names + soil_names  # Categorical variables

target = 'Cover_Type'
names = cont_names + cat_names  # All column names except target
dtypes_dict = {**cont_dict, **area_dict, **soil_dict}

# In[4]:

# Read data
import os
import pandas as pd

data = pd.read_csv('covtype.data', header=None, names=names + [target], dtype=dtypes_dict)

# In[5]:

print(data.shape)

# ### First look at continuous variables

# In[6]:

data.head()[cont_names]

# In[7]:

data.describe()[cont_names]

# Note that there are no missing values; the readme confirms this as well.

# In[8]:

# Class distribution
print("percentage distribution of classes:")
series = 100 * data['Cover_Type'].value_counts() / data['Cover_Type'].count()
ax = data.Cover_Type.value_counts().plot(kind='pie')
ax.set_aspect('equal')
pd.DataFrame({'percentage': series})

# We have 7 classes, of which 2 classes form 85% of the data and 2 classes have
# less than 2% representation each. The classes are therefore heavily skewed,
# and we need to take that into account during analysis.
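# One standard way to account for this skew, which the cross-validation cells
# further below also rely on, is stratified splitting, so that every train/test
# split keeps roughly the same class proportions as the full dataset.
# A minimal sketch (this split and its variable names are illustrative only and
# are not reused later):

# In[ ]:

from sklearn.model_selection import train_test_split

_train, _test = train_test_split(data, test_size=0.25, random_state=0,
                                 stratify=data['Cover_Type'])
# The test-split proportions closely match the full-data distribution shown above
print(100 * _test['Cover_Type'].value_counts(normalize=True).sort_index())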
# ## Correlated variables

# In[9]:

# Find correlated variables
X_cont = data[cont_names]
corr_data = X_cont.corr()

size = len(cont_names)
corr_list = []
for i in range(size):
    for j in range(i + 1, size):
        cc = corr_data.iloc[i, j]
        if abs(cc) > 0.5:
            corr_list.append((cc, i, j))

corr_list.sort(key=(lambda x: abs(x[0])), reverse=True)
[(cc, cont_names[i], cont_names[j]) for cc, i, j in corr_list]

# In[10]:

g = sns.pairplot(data, hue='Cover_Type', size=3,
                 x_vars=['Hillshade_9am'], y_vars=['Hillshade_3pm'])
g.map_offdiag(plt.scatter, s=35, alpha=0.5)

# In[11]:

sns.pairplot(data, hue='Cover_Type', size=3,
             y_vars=['Hillshade_Noon'], x_vars=['Hillshade_9am', 'Hillshade_3pm'])

# In[12]:

pp = sns.pairplot(data.iloc[::20, :], hue='Slope', size=3, x_vars=['Aspect'],
                  y_vars=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])
pp.fig.legends.clear()
print('Slope increases from red to yellow to green to blue to pink')

# In[13]:

pp = sns.pairplot(data[data['Aspect'] <= 360].iloc[::200, :], hue='Aspect', size=3,
                  x_vars=['Slope'],
                  y_vars=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])
pp.fig.legends.clear()
print('Aspect increases from red to yellow to green to blue to pink')

# Since hillshade first increases and then decreases with slope,
# hillshade is actually a measure of the sunlight falling on an area
# (contrary to what its name suggests).
# Let's find the slope at which maximum sunlight is observed.

# In[14]:

series = data.loc[data['Hillshade_Noon'] == data['Hillshade_Noon'].max(), 'Slope']
slope_mean, slope_std = series.mean(), series.std()
print("Slope with max sunlight: {}° ± {}°".format(slope_mean, slope_std))

# ```
# Slope where we get maximum sunlight = 20.35°.
# Latitude = 40.63° (from the readme and Google).
# At solar noon the sun's zenith angle is latitude minus declination, and noon
# sunlight on a slope is maximal when the slope faces the sun directly,
# so declination = 40.63° - 20.35° = 20.28° north, which means the data was
# collected in summer.
# Assuming the declination varies sinusoidally over the year,
# day of year = asin(20.28 / 23.5) * 365.25 / (2 * pi) = 61 days
# after the spring equinox, i.e. March 20.
# Therefore the estimated data collection date is May 20.
# ```
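# A quick numerical sanity check of the arithmetic above. This is only a sketch:
# the numbers are the ones quoted above (slope ≈ 20.35°, latitude ≈ 40.63°,
# axial tilt 23.5°) rather than recomputed from the data, and the variable names
# are new, not part of the original analysis.

# In[ ]:

import math

slope_deg = 20.35       # slope receiving maximum noon hillshade (see In[14])
latitude_deg = 40.63    # study-area latitude, from the readme
declination_deg = latitude_deg - slope_deg  # ≈ 20.28° north
days_after_equinox = math.asin(declination_deg / 23.5) * 365.25 / (2 * math.pi)
print("~{:.0f} days after the spring equinox (March 20), i.e. around May 20"
      .format(days_after_equinox))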
# In[15]:

sns.pairplot(data, hue='Cover_Type', size=5,
             x_vars=['R_Hydrology'], y_vars=['Z_Hydrology'])

# ## Preprocessing
# ### Feature creation
#
# By reading the dataset description, we can see that soils are also grouped by
# 'climatic zone' (referred to as `czone` from now on) and 'geologic zone'
# (referred to as `gzone` from now on). This information is not present in the
# dataset, but we can add it ourselves.

# In[18]:

# "One-cool" the binary columns (reverse of the one-hot encoding procedure)
data['soil_type'] = 0
for i in range(1, 41):
    data['soil_type'] += i * data['Soil_' + str(i)]

data['warea'] = 0
for i in range(1, 5):
    data['warea'] += i * data['WArea_' + str(i)]

data.head()

# In[19]:

# Get soil type data from readme.txt
czone_to_soil = {
    2: range(1, 7),
    3: range(7, 9),
    4: range(9, 14),
    5: range(14, 16),
    6: range(16, 19),
    7: range(19, 35),
    8: range(35, 41),
}
gzone_to_soil = {
    1: [14, 15, 16, 17, 19, 20, 21],
    2: [9, 22, 23],
    5: range(7, 9),
    7: list(range(1, 7)) + list(range(10, 14)) + [18] + list(range(24, 41)),
}

def invert_dict(d):
    """Takes a dict where values are iterables and returns an inverse mapping"""
    newdict = {}
    for k, vlist in d.items():
        for v in vlist:
            newdict[v] = k
    return newdict

soil_to_czone = invert_dict(czone_to_soil)
soil_to_gzone = invert_dict(gzone_to_soil)

# In[20]:

# Make columns for czone and gzone and one-hot encode them
data['czone'] = data['soil_type'].map(soil_to_czone)
data['gzone'] = data['soil_type'].map(soil_to_gzone)

for soil_type in range(1, 41):
    czone_num = soil_to_czone[soil_type]
    data['czone' + str(czone_num)] = data['czone'].map(lambda x: 1 if x == czone_num else 0)
for soil_type in range(1, 41):
    gzone_num = soil_to_gzone[soil_type]
    data['gzone' + str(gzone_num)] = data['gzone'].map(lambda x: 1 if x == gzone_num else 0)

# In[21]:

czone_names = ['czone' + str(c) for c in czone_to_soil.keys()]
gzone_names = ['gzone' + str(g) for g in gzone_to_soil.keys()]
cg_names = czone_names + gzone_names
cat_names += cg_names
names += cg_names

# In[22]:

# Write only the "cooled" data to a separate CSV file if needed
cool_data = data[list(data.columns[:10]) + ['warea', 'soil_type', 'czone', 'gzone', 'Cover_Type']]
# cool_data.to_csv('cool_data.csv', index=False)

# ## Classification

# In[23]:

# Prepare inputs and targets
from collections import OrderedDict

y = data[target]
X = data[names]
# Don't include created features
X_no_fc = data[cont_names + area_names + soil_names]
# Replace original soil columns by created features
X_only_fc = data[cont_names + area_names + czone_names + gzone_names]
# Don't include any soil data
X_no_soil = data[cont_names + area_names]

X_dict = OrderedDict([
    ('X_no_fc', X_no_fc),
    ('X_only_fc', X_only_fc),
    ('X_no_soil', X_no_soil),
    ('X', X),
])

# In[24]:

print("Null Accuracy:", y.value_counts().max() / y.count())

# In[44]:

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn import metrics

def my_tts_score(clf, X, y):
    """Accuracy on a single stratified 75/25 train/test split."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2, stratify=y)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

def my_cross_val_score(clf, X, y):
    """Accuracies from stratified 4-fold cross-validation."""
    fold_generator = StratifiedKFold(n_splits=4, random_state=2, shuffle=True)
    return cross_val_score(clf, X, y, cv=fold_generator, n_jobs=-1)

def print_cv_accuracy(clf, X, y):
    accs = my_cross_val_score(clf, X, y)
    print("{}-fold CV accuracy: {} % ± {} %".format(4, 100 * accs.mean(), 100 * accs.std()))

# In[26]:

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

df = pd.DataFrame({}, columns=['mean', 'std', 'depth', 'node_count'], index=X_dict.keys())
for X_name, X_val in X_dict.items():
    dtree = DecisionTreeClassifier()
    accs = my_cross_val_score(dtree, X_val, y)
    df.loc[X_name, 'mean'] = 100 * accs.mean()
    df.loc[X_name, 'std'] = 100 * accs.std()
    dtree.fit(X_val, y)
    df.loc[X_name, 'depth'] = dtree.tree_.max_depth
    df.loc[X_name, 'node_count'] = dtree.tree_.node_count
print('CV Accuracy (in %) with DecisionTree:')
df

# We can see that adding the new features doesn't help much in increasing the
# accuracy of the decision tree.
# By default, scikit-learn grows completely unpruned trees.
# Let's try pre-pruning (scikit-learn doesn't yet support post-pruning).
#
# We will build a decision tree for various impurity thresholds
# and plot a graph showing the variation.
# This will help us find the optimally pruned tree:
# pruned enough to generalize and avoid overfitting,
# but not pruned so much that it loses the complexity needed to fit the data.

# In[27]:

X = X_no_fc

# In[28]:

# Pre-pruning
def plot_acc_vs_impurity(minimp, maximp, intervals=10):
    # Plots CV accuracy against impurity thresholds from minimp to maximp,
    # evaluated at (intervals + 1) evenly spaced points.
    impurities = [minimp + (maximp - minimp) * x / intervals for x in range(intervals + 1)]
    maccs = []
    for impurity in impurities:
        dtree = DecisionTreeClassifier(min_impurity_split=impurity)
        accs = my_cross_val_score(dtree, X, y)
        maccs.append(accs.mean())
    plt.plot(impurities, maccs, 'b')

# In[53]:

plot_acc_vs_impurity(0, 0.1, 10)

# In[33]:

plot_acc_vs_impurity(0, 0.01, 50)

# Since accuracy varies so erratically with the impurity threshold, increasing
# the threshold will probably not give a significant benefit.

# ### Feature importance

# In[48]:

# Find the gini importance of features
def print_importance_data(col_names, feature_importances):
    fi = sorted(zip(col_names, feature_importances), key=(lambda x: x[1]), reverse=True)
    for i, (feature, importance) in enumerate(fi[:15]):
        print('{}. {}:\t{}'.format(i + 1, feature.ljust(15), importance))
    print("Contribution of 15 most important features:", sum([x[1] for x in fi[:15]]))

# In[30]:

from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree.fit(X, y)
print_importance_data(X.columns, dtree.feature_importances_)

# In[31]:

# Find the accuracy when using only the 15 most important features
essential_features = [
    "Elevation", "R_Fire_Points", "R_Roadways",
    "R_Hydrology", "Z_Hydrology", "Hillshade_Noon",
    "Aspect", "Hillshade_9am", "Hillshade_3pm",
    "czone8", "czone2", "gzone7",
    "WArea_3", "Slope", "WArea_1",
]
X_ess = data[essential_features]
print_cv_accuracy(DecisionTreeClassifier(), X_ess, y)

# Dropping the remaining features didn't increase accuracy, so they were
# probably not just noise.
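# As a side note, the 15-feature list above can also be derived programmatically
# from the fitted tree's importances instead of being typed out by hand. A minimal
# sketch, reusing `dtree`, `X`, `y` and `print_cv_accuracy` from the cells above
# (the `top_features` list computed here is illustrative and depends on which
# feature set `X` currently refers to):

# In[ ]:

top_idx = np.argsort(dtree.feature_importances_)[::-1][:15]
top_features = [X.columns[i] for i in top_idx]
print(top_features)
print_cv_accuracy(DecisionTreeClassifier(), data[top_features], y)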
# In[43]:

# Ensemble Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf_list = [RandomForestClassifier, AdaBoostClassifier]
importances_list = []
clf_to_name = [(clf_type, clf_type.__name__[:-len('Classifier')]) for clf_type in clf_list]
name_to_accs = {clf_name: [] for clf_type, clf_name in clf_to_name}

ns = [10, 40]
for n in ns:
    for clf_type, clf_name in clf_to_name:
        clf = clf_type(n_estimators=n)
        # Accuracy on a single stratified train/test split (not cross-validated)
        acc = my_tts_score(clf, X, y)
        name_to_accs[clf_name].append(100 * acc)
        clf.fit(X, y)
        importances_list.append(clf.feature_importances_)

pd.DataFrame(name_to_accs, index=ns)

# In[52]:

# importances_list[0] is the RandomForest fitted with n_estimators=10
print("RandomForest feature importances:")
print_importance_data(X.columns, importances_list[0])

# In[ ]:
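
# The ensemble accuracies above come from a single train/test split (my_tts_score).
# For a figure directly comparable to the 4-fold CV results reported for the
# decision trees, the same helper can be reused. This is only a sketch and is
# slow, since it refits the forest on each of the 4 folds:

print_cv_accuracy(RandomForestClassifier(n_estimators=40), X, y)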