from IPython.display import Image
Image('https://kaggle2.blob.core.windows.net/competitions/kaggle/3966/media/overview-AfSIS-kaggle.png', width=700)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
%pylab inline
from sklearn.grid_search import GridSearchCV
from pprint import pprint
import cPickle
from datetime import datetime
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm

# reading the csv file and setting the first column as index
soil = pd.read_csv('./train/training.csv', index_col=0)

# printing the size of the data frame
print 'Size of data frame: ', soil.shape

# summary of the data by column
soil.describe()

# printing the first five rows of the whole data frame
soil.head()

# checking whether the index is unique
soil.index.is_unique

# defining a function to count missing values
def count_missing(frame):
    return (frame.shape[0] * frame.shape[1]) - frame.count().sum()

# checking for missing values in the data frame
print 'Missing Values: ', count_missing(soil)

# getting a list of all the columns starting with 'm' (the spectral wavenumbers)
wavecolumns = [wn for wn in soil.columns.values if wn.startswith('m')]

# picking 20 random rows
randomrows = np.random.random_integers(0, soil.shape[0] - 1, 20)

# getting the indices of the random rows selected
indices = [soil.index.values[index] for index in randomrows]

# plotting the infrared spectrum of each of the 20 random samples
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 70), sharex=False)
fig.subplots_adjust(hspace=.08, wspace=.1)
axes = axes.ravel()
for i, index in enumerate(indices):
    (soil.loc[index, wavecolumns[0]:wavecolumns[-1]]).plot(ax=axes[i], lw=2.)
    axes[i].legend((index,), loc='upper left')
    axes[i].set_ylim(0, 2.5)

soil.columns[-21:]

# histograms of the last 21 columns (the spatial predictors and the 5 regression targets), Depth excluded
histog = [col for col in soil.columns[-21:].values if col != 'Depth']
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 70), sharex=False)
fig.subplots_adjust(hspace=.08, wspace=.1)
axes = axes.ravel()
for i, col in enumerate(histog):
    if i > 14:
        # the last five columns are the regression targets
        soil[col].hist(ax=axes[i], bins=18, color='red')
        leg = col + ' (target of regression)'
        axes[i].legend((leg,), loc='upper right')
    else:
        soil[col].hist(ax=axes[i], bins=18)
        axes[i].legend((col,), loc='upper right')

# boxplots of the same columns split by Depth, with the raw observations
# overlaid as jittered points around each group's position
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 70), sharex=False)
fig.subplots_adjust(hspace=.2, wspace=.07)
axes = axes.ravel()
for i, col in enumerate(histog):
    soil.boxplot(ax=axes[i], column=col, by='Depth')
    if i > 14:
        leg = col + ' (target of regression)'
        axes[i].legend((leg,), loc='upper right')
    else:
        axes[i].legend((col,), loc='upper right')
    # jitter each depth group around its own boxplot position
    # (Subsoil is drawn at x=1, Topsoil at x=2)
    for soiltype, pos in zip(['Subsoil', 'Topsoil'], [1, 2]):
        y = soil[col][soil.Depth == soiltype]
        x = np.random.normal(pos, 0.04, size=len(y))
        axes[i].plot(x, y, 'r.', alpha=0.08)

# generating dummy variables out of Depth
soildepth = pd.get_dummies(soil['Depth'])
soildepth.head(10)

# plotting 50 consecutive entries of Subsoil
soildepth.Subsoil[1000:1050].plot(figsize=(10, 8))

# plotting the first 30 entries of the REF3 feature. Consecutive values come in pairs,
# probably meaning that soil samples are collected at the same exact point at two
# different depths. This also explains why boxplots of the same predictor split by
# Depth look so similar to one another.
soil.REF3[:30].plot(figsize=(10, 8))
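# Illustrative check, not in the original notebook: if consecutive rows really are
# the same sampling point at two depths, their spatial predictors (REF3 here)
# should match pairwise. This only assumes the row order of the raw csv.
vals = soil.REF3.values
n = (len(vals) // 2) * 2  # trim a possible odd trailing row
paired = vals[0:n:2] == vals[1:n:2]
print 'Fraction of consecutive row pairs sharing REF3: ', paired.mean()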
# plotting the whole REF3 feature
soil.REF3.plot(figsize=(10, 8))

# rather than the pattern above, we would probably like the data to look like the
# following, i.e. randomized. we'll take care of the shuffling in the Machine Learning pipeline.
pd.DataFrame(soil.REF3).apply(np.random.permutation).plot(figsize=(10, 8))

def computeMCRMSE(y_actual, y_predicted):
    """
    computes the mean columnwise root mean squared error
    given the 5 actual and the 5 predicted target columns
    """
    n = len(y_actual[0])
    summ = 0
    for i in range(len(y_actual)):
        summ += np.sqrt(((np.array(y_actual[i]) - y_predicted[i]) ** 2).sum() / n)
    return summ / 5

def performParameterSelection(model_name, X, y):
    """
    performs GridSearchCV on the train set with a specific model and parameter grid
    """
    model, param_grid = MODELS[model_name]
    gs = GridSearchCV(model, param_grid, n_jobs=-1, cv=5, verbose=1)
    gs.fit(X, y)
    pprint(sorted(gs.grid_scores_, key=lambda x: -x.mean_validation_score))
    fname_out = '{}-{}-{}.pickle'.format(model_name, y.name, datetime.now())
    with open(fname_out, 'wb') as fout:
        cPickle.dump(gs, fout, -1)
    print "Saved model to {}".format(fname_out)

def getDummiedSoilDepth(df):
    """
    gets dummy variables from the binary categorical Depth (Subsoil, Topsoil),
    then drops the categorical column
    """
    soildepth = pd.get_dummies(df['Depth'])
    df.insert(0, 'Subsoil', soildepth['Subsoil'])
    df.insert(0, 'Topsoil', soildepth['Topsoil'])
    df = df.drop('Depth', 1)
    return df

def shuffleData(df):
    """
    randomly shuffles the dataframe, because entries come in Depth pairs
    """
    df = df.reindex(np.random.permutation(df.index))
    return df

def splitTrainTest(dataframe, percentage):
    """
    splits the shuffled dataframe into train and test set according to percentage (float).
    note that the predictors are scaled on the full dataset before the split.
    """
    index = int(np.floor(dataframe.shape[0] * percentage))
    predictors = dataframe.iloc[:, :-5]
    predictors = pd.DataFrame(preprocessing.scale(predictors),
                              index=predictors.index, columns=predictors.columns)
    output = dataframe.iloc[:, -5:]

    X_train = predictors[:index]
    y_train = output[:index]
    Ca_train = y_train['Ca']
    P_train = y_train['P']
    pH_train = y_train['pH']
    SOC_train = y_train['SOC']
    Sand_train = y_train['Sand']

    X_test = predictors[index:]
    y_test = output[index:]
    Ca_test = y_test['Ca']
    P_test = y_test['P']
    pH_test = y_test['pH']
    SOC_test = y_test['SOC']
    Sand_test = y_test['Sand']

    return X_train, Ca_train, P_train, pH_train, SOC_train, Sand_train, \
           X_test, Ca_test, P_test, pH_test, SOC_test, Sand_test

def performSVR(X_train, y_train, X_test, parameters):
    """ SVR regression """
    C = parameters[0]
    epsilon = parameters[1]
    kernel = parameters[2]
    clf = svm.SVR(C=C, epsilon=epsilon, kernel=kernel)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)

def performRidge(X_train, y_train, X_test, parameters):
    """ Ridge regression """
    alpha = parameters[0]
    clf = linear_model.Ridge(alpha=alpha)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)

def performLasso(X_train, y_train, X_test, parameters):
    """ Lasso regression """
    alpha = parameters[0]
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)

def performENET(X_train, y_train, X_test, parameters):
    """ ElasticNet regression """
    alpha = parameters[0]
    l1_ratio = parameters[1]
    clf = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)
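# Illustrative check, not in the original notebook: computeMCRMSE on toy data.
# Five "columns" of two samples each, every prediction off by exactly 1, so each
# columnwise RMSE is 1 and the mean columnwise RMSE should print 1.0.
toy_actual = [np.array([1., 2.]), np.array([3., 4.]), np.array([5., 6.]),
              np.array([7., 8.]), np.array([9., 10.])]
toy_predicted = [col + 1. for col in toy_actual]
print 'toy MCRMSE: ', computeMCRMSE(toy_actual, toy_predicted)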
# Scikit-Learn Algorithm Cheat Sheet
from IPython.core.display import HTML
HTML("")

MODELS = {
    'lasso': (
        linear_model.Lasso(),
        {'alpha': [0.001, 0.01, 0.1, 1]},
    ),
    'ridge': (
        linear_model.Ridge(),
        {'alpha': [0.001, 0.01, 0.1, 1]},
    ),
    'elasticnet': (
        linear_model.ElasticNet(),
        {'alpha': [0.001, 0.01, 0.1, 1], 'l1_ratio': [0.001, 0.01, 0.1, 1]},
    ),
    'svr': (
        svm.SVR(),
        {'C': [0.001, 0.01, 0.1, 1], 'epsilon': [0.001, 0.01, 0.1, 1], 'kernel': ['linear']},
    )
}

# rereading the original csv file to overwrite the soil data frame and start from scratch
soil = pd.read_csv('./train/training.csv', index_col=0)

# generating dummy variables out of the Depth feature
soil = getDummiedSoilDepth(soil)

# setting the seed and shuffling the data frame
np.random.seed(2013)
soil = shuffleData(soil)

# generating the train and test sets with all the target variables
X_train, Ca_train, P_train, pH_train, SOC_train, Sand_train, \
X_test, Ca_test, P_test, pH_test, SOC_test, Sand_test = splitTrainTest(soil, 0.8)

trainCV = [Ca_train, P_train, pH_train, SOC_train, Sand_train]

# for each target variable a CV on the train set will be performed.
# GridSearchCV will print to screen and, at the same time, a log file with all results will be saved.
# The best model for each algorithm and target will be saved to disk.
#######################################################################################################
# TWO LINES ARE COMMENTED OUT IN THE FOLLOWING CODE TO PREVENT WRITING TO FILE.
# UNCOMMENT THEM IF YOU WANT EVERYTHING TO RUN PROPERLY.
#######################################################################################################
for y_tr in trainCV:
    logname = './{}-{}.txt'.format(y_tr.name, datetime.now())
    #sys.stdout = open(logname, 'w')
    for model in MODELS.keys():
        pass
        #print '======================================='
        #print y_tr.name, model
        #print '======================================='
        #performParameterSelection(model, X_train, y_tr)

from IPython.core.display import Image
Image(filename='cv.png', width=700)

# test set with all the test targets previously generated
test = [X_test, Ca_test, P_test, pH_test, SOC_test, Sand_test]

# best models found by the grid search
best_Ca = './Sep25/svr-Ca-2014-09-25 08:11:09.934405.pickle'
best_P = './Sep25/svr-P-2014-09-25 08:31:53.611168.pickle'
best_pH = './Sep25/ridge-pH-2014-09-25 08:48:25.247992.pickle'
best_SOC = './Sep25/ridge-SOC-2014-09-25 09:06:50.757604.pickle'
best_Sand = './Sep25/svr-Sand-2014-09-25 09:23:22.520904.pickle'
best_models = [best_Ca, best_P, best_pH, best_SOC, best_Sand]

y_predicted = []
# each saved model is applied to the test set to predict its target value
for model in best_models:
    with open(model, 'rb') as fin:
        mod = cPickle.load(fin)
    y_predicted.append(mod.predict(X_test))

# computing the mean columnwise root mean squared error
print 'MCRMSE: ', computeMCRMSE(test[1:], y_predicted)
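# Illustrative sketch, not in the original notebook: since performParameterSelection
# pickles the whole GridSearchCV object, a saved file can be inspected to see which
# hyper-parameters won. best_params_ and best_score_ are standard GridSearchCV
# attributes; best_pH is one of the paths listed above.
with open(best_pH, 'rb') as fin:
    gs = cPickle.load(fin)
print 'best params for pH: ', gs.best_params_
print 'best CV score for pH: ', gs.best_score_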