#!/usr/bin/env python
# coding: utf-8

# ## Kaggle's Predicting Red Hat Business Value
#
# This is a first quick & dirty attempt at Kaggle's [Predicting Red Hat Business Value](https://www.kaggle.com/c/predicting-red-hat-business-value) competition.
#
# ### Loading in the data

# In[1]:

import pandas as pd

people = pd.read_csv('people.csv.zip')
people.head(3)


# In[2]:

actions = pd.read_csv('act_train.csv.zip')
actions.head(3)


# ## Joining together to get the dataset

# In[3]:

training_data_full = pd.merge(actions, people, how='inner', on='people_id',
                              suffixes=['_action', '_person'], sort=False)
training_data_full.head(5)


# In[4]:

(actions.shape, people.shape, training_data_full.shape)


# ## Building a preprocessing pipeline

# In[5]:

# %load "preprocessing_transforms.py"
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd


class BaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, **transform_params):
        # Default is the identity transform; subclasses override this.
        return X


class ColumnSelector(BaseTransformer):
    """Selects columns from a pandas DataFrame"""

    def __init__(self, columns, c_type=None):
        self.columns = columns
        self.c_type = c_type

    def transform(self, X, **transform_params):
        cs = X[self.columns]
        if self.c_type is None:
            return cs
        else:
            return cs.astype(self.c_type)


class SpreadBinary(BaseTransformer):
    """Spreads 0/1 indicator columns out to -1/+1"""

    def transform(self, X, **transform_params):
        return X.applymap(lambda x: 1 if x == 1 else -1)


class DfTransformerAdapter(BaseTransformer):
    """Adapts a scikit-learn Transformer to return a pandas DataFrame"""

    def __init__(self, transformer):
        self.transformer = transformer

    def fit(self, X, y=None, **fit_params):
        self.transformer.fit(X, y=y, **fit_params)
        return self

    def transform(self, X, **transform_params):
        raw_result = self.transformer.transform(X, **transform_params)
        return pd.DataFrame(raw_result, columns=X.columns, index=X.index)


class DfOneHot(BaseTransformer):
    """
    Wraps helper method `get_dummies`, making sure all columns get one-hot encoded.
    """

    def __init__(self):
        self.dummy_columns = []

    def fit(self, X, y=None, **fit_params):
        self.dummy_columns = pd.get_dummies(
            X,
            prefix=[c for c in X.columns],
            columns=X.columns).columns
        return self

    def transform(self, X, **transform_params):
        return pd.get_dummies(
            X,
            prefix=[c for c in X.columns],
            columns=X.columns).reindex(columns=self.dummy_columns, fill_value=0)


class DfFeatureUnion(BaseTransformer):
    """A DataFrame-friendly implementation of `FeatureUnion`"""

    def __init__(self, transformers):
        self.transformers = transformers

    def fit(self, X, y=None, **fit_params):
        for l, t in self.transformers:
            t.fit(X, y=y, **fit_params)
        return self

    def transform(self, X, **transform_params):
        transform_results = [t.transform(X, **transform_params) for l, t in self.transformers]
        return pd.concat(transform_results, axis=1)


# In[6]:

training_data_full.columns


# In[7]:

for col in training_data_full.columns:
    print("in {} there are {} unique values".format(col, len(training_data_full[col].unique())))
None


# ### Potential trouble with high dimensionality
#
# Notice that char_10_action, group_1, and others have a ton of unique values; one-hot encoding them would produce a dataframe with thousands of columns.
#
# Since we're being lazy and trying to get to a first attempt as fast as possible, let's skip those and only consider categorical variables with roughly 20 or fewer unique values. We'll get smarter about dealing with these variables and reincorporate them in the model in a subsequent attempt.
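# As an aside (my own sketch, not part of the original run), the low-cardinality
# columns could also be found programmatically rather than listed by hand -- assuming
# a cutoff of 20 unique values:

# In[ ]:

# Identifiers, the target, and the quantitative char_38 column are excluded up front;
# the high-cardinality columns (dates, group_1, char_10_action, ...) fall out via the
# nunique() threshold.
excluded = {'people_id', 'activity_id', 'outcome', 'char_38'}
low_cardinality_columns = sorted(
    col for col in training_data_full.columns
    if col not in excluded and training_data_full[col].nunique() <= 20)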
# In[8]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler

cat_columns = ['activity_category',
               'char_1_action', 'char_2_action', 'char_3_action', 'char_4_action',
               'char_5_action', 'char_6_action', 'char_7_action', 'char_8_action',
               'char_9_action',
               'char_1_person', 'char_2_person', 'char_3_person', 'char_4_person',
               'char_5_person', 'char_6_person', 'char_7_person', 'char_8_person',
               'char_9_person', 'char_10_person',
               'char_11', 'char_12', 'char_13', 'char_14', 'char_15', 'char_16',
               'char_17', 'char_18', 'char_19', 'char_20', 'char_21', 'char_22',
               'char_23', 'char_24', 'char_25', 'char_26', 'char_27', 'char_28',
               'char_29', 'char_30', 'char_31', 'char_32', 'char_33', 'char_34',
               'char_35', 'char_36', 'char_37']

q_columns = ['char_38']

preprocessor = Pipeline([
    ('features', DfFeatureUnion([
        ('quantitative', Pipeline([
            ('select-quantitative', ColumnSelector(q_columns, c_type='float')),
            ('impute-missing', DfTransformerAdapter(Imputer(strategy='median'))),
            ('scale', DfTransformerAdapter(StandardScaler()))
        ])),
        ('categorical', Pipeline([
            ('select-categorical', ColumnSelector(cat_columns)),
            ('apply-onehot', DfOneHot()),
            ('spread-binary', SpreadBinary())
        ])),
    ]))
])


# ### Sampling to reduce runtime when training on a large dataset
#
# Training models on the entire provided training dataset exhausts the memory on my laptop. Again, in the spirit of getting something quick and dirty working, we'll sample the dataset and train on that. We'll then evaluate the models by testing accuracy on a larger sample.

# In[19]:

from sklearn.cross_validation import train_test_split

training_frac = 0.05
test_frac = 0.8

training_data, the_rest = train_test_split(training_data_full,
                                           train_size=training_frac,
                                           random_state=0)
test_data = the_rest.sample(frac=test_frac)


# In[20]:

training_data.shape


# In[21]:

test_data.shape


# In[22]:

wrangled = preprocessor.fit_transform(training_data)


# In[23]:

wrangled.head()


# ## Putting together classifiers

# In[24]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ('wrangle', preprocessor),
    ('lr', LogisticRegression(C=100.0, random_state=0))
])

pipe_rf = Pipeline([
    ('wrangle', preprocessor),
    ('rf', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0))
])


# In[25]:

feature_columns = cat_columns + q_columns


# In[26]:

def extract_X_y(df):
    return df[feature_columns], df['outcome']

X_train, y_train = extract_X_y(training_data)
X_test, y_test = extract_X_y(test_data)


# ### Reporting utilities
#
# Some utilities to make reporting progress easier.

# In[48]:

import time
import subprocess


class time_and_log():
    """Context manager that prints (and optionally speaks) start/finish messages with elapsed time."""

    def __init__(self, label, *, prefix='', say=False):
        self.label = label
        self.prefix = prefix
        self.say = say

    def __enter__(self):
        msg = 'Starting {}'.format(self.label)
        print('{}{}'.format(self.prefix, msg))
        if self.say:
            cmd_say(msg)
        self.start = time.process_time()
        return self

    def __exit__(self, *exc):
        self.interval = time.process_time() - self.start
        msg = 'Finished {} in {:.2f} seconds'.format(self.label, self.interval)
        print('{}{}'.format(self.prefix, msg))
        if self.say:
            cmd_say(msg)
        return False


def cmd_say(msg):
    subprocess.call("say '{}'".format(msg), shell=True)
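# As a quick illustration (my own addition), the context manager above is used like
# this -- the label text is arbitrary, and passing `say=True` would also speak the
# messages aloud via the macOS `say` command:

# In[ ]:

with time_and_log('summing a big range', prefix='  '):
    # Any CPU-bound work; prints a start message, then a finish message with elapsed CPU time.
    total = sum(range(10 ** 7))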
# ### Cross validation and full test set accuracy
#
# We'll cross validate within the training set, and then train on the full training set and see how well it performs on the full test set.

# In[50]:

from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
import numpy as np

models = [
    ('logistic regression', pipe_lr),
    ('random forest', pipe_rf),
]

for label, model in models:
    print('Evaluating {}'.format(label))
    cmd_say('Evaluating {}'.format(label))
    # with time_and_log('cross validating', say=True, prefix=" _"):
    #     scores = cross_val_score(estimator=model,
    #                              X=X_train,
    #                              y=y_train,
    #                              cv=5,
    #                              n_jobs=1)
    #     print(' CV accuracy: {:.3f} +/- {:.3f}'.format(np.mean(scores), np.std(scores)))
    with time_and_log('fitting full training set', say=True, prefix=" _"):
        model.fit(X_train, y_train)
    with time_and_log('evaluating on full test set', say=True, prefix=" _"):
        print(" Full test accuracy ({:.2f} of dataset): {:.3f}".format(
            test_frac, accuracy_score(y_test, model.predict(X_test))))


# ## Preparing the submission
#
# Random forest beat logistic regression, so let's start with a submission using that.
#
# But first, let's see what the submission is supposed to look like:

# In[56]:

pd.read_csv('sample_submission.csv.zip').head(5)


# And now let's prepare the submission by fitting on the full provided training set and using that model to predict on the provided test set.

# In[57]:

kaggle_test_df = pd.merge(
    pd.read_csv('act_test.csv.zip'),
    people,
    how='inner',
    on='people_id',
    suffixes=['_action', '_person'],
    sort=False)
kaggle_test_df.head(2)


# In[55]:

kaggle_test_df.shape


# In[58]:

X_kaggle_train, y_kaggle_train = extract_X_y(training_data_full)


# In[59]:

with time_and_log('fitting rf on full kaggle training set', say=True):
    pipe_rf.fit(X_kaggle_train, y_kaggle_train)


# In[60]:

with time_and_log('preparing kaggle submission', say=True):
    submission_df = kaggle_test_df[['activity_id']].copy()
    submission_df['outcome'] = pipe_rf.predict(kaggle_test_df)
    submission_df.to_csv("predicting-red-hat-business-value_1_rf.csv", index=False)


# This got me to 85% accuracy on the submission, placing 1099th out of 1250 teams. There are 190 people with 99% or greater accuracy and 837 with 95% or greater, so this definitely qualifies as merely a quick and dirty submission :)
#
# It's also worth noting that [people apparently figured out](https://www.kaggle.com/c/predicting-red-hat-business-value/forums/t/22898/updated-competition-deadline) how to get 98% by looking only at the date and group columns, two of the columns I ditched to make things easier to get started.
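# A natural next step, per the note above, would be to fold the date columns back in.
# Here's a minimal sketch of my own (not run for this submission), assuming the merged
# frames name them `date_action` and `date_person`:

# In[ ]:

for frame in (training_data_full, kaggle_test_df):
    for date_col in ('date_action', 'date_person'):
        parsed = pd.to_datetime(frame[date_col])
        # Simple calendar features that could then be appended to cat_columns/q_columns.
        frame[date_col + '_year'] = parsed.dt.year
        frame[date_col + '_month'] = parsed.dt.month
        frame[date_col + '_day'] = parsed.dt.day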