from IPython.display import Image
Image('https://kaggle2.blob.core.windows.net/competitions/kaggle/3966/media/overview-AfSIS-kaggle.png', width=700)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
%pylab inline
from sklearn.grid_search import GridSearchCV
from pprint import pprint
import cPickle
from datetime import datetime
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm

# reading the csv file and setting the first column as index
soil = pd.read_csv('./train/training.csv', index_col=0)

# printing the size of the data frame
print 'Size of data frame: ', soil.shape

# summary of the data by column
soil.describe()

# printing the first five rows of the whole data frame
soil.head()

# checking whether the index is unique
soil.index.is_unique

# defining a function to count missing values
def count_missing(frame):
    return (frame.shape[0] * frame.shape[1]) - frame.count().sum()

# checking for missing values in the data frame
print 'Missing Values: ', count_missing(soil)

# getting a list of all the columns starting with 'm' (the spectral wavenumbers)
wavecolumns = [wn for wn in soil.columns.values if wn.startswith('m')]

# picking 20 random rows
randomrows = np.random.random_integers(0, soil.shape[0] - 1, 20)

# getting the indices of the random rows selected
indices = [soil.index.values[index] for index in randomrows]

# plotting the infrared spectrum of each of the 20 random samples
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 70), sharex=False)
fig.subplots_adjust(hspace=.08, wspace=.1)
axes = axes.ravel()
for i, index in enumerate(indices):
    (soil.loc[index, wavecolumns[0]:wavecolumns[-1]]).plot(ax=axes[i], lw=2.)
    axes[i].legend((index,), loc='upper left')
    axes[i].set_ylim(0, 2.5)

soil.columns[-21:]

# histograms of the last 21 columns (the spatial predictors and the 5 regression targets), Depth excluded
histog = [col for col in soil.columns[-21:].values if col != 'Depth']
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 70), sharex=False)
fig.subplots_adjust(hspace=.08, wspace=.1)
axes = axes.ravel()
for i, col in enumerate(histog):
    if i > 14:
        # the last five columns are the regression targets
        soil[col].hist(ax=axes[i], bins=18, color='red')
        leg = col + ' (target of regression)'
        axes[i].legend((leg,), loc='upper right')
    else:
        soil[col].hist(ax=axes[i], bins=18)
        axes[i].legend((col,), loc='upper right')

# boxplots of the same columns split by Depth, with the raw observations
# overlaid as jittered points around each group's position
fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(18, 70), sharex=False)
fig.subplots_adjust(hspace=.2, wspace=.07)
axes = axes.ravel()
for i, col in enumerate(histog):
    soil.boxplot(ax=axes[i], column=col, by='Depth')
    if i > 14:
        leg = col + ' (target of regression)'
        axes[i].legend((leg,), loc='upper right')
    else:
        axes[i].legend((col,), loc='upper right')
    # jitter each depth group around its own boxplot position
    # (Subsoil is drawn at x=1, Topsoil at x=2)
    for soiltype, pos in zip(['Subsoil', 'Topsoil'], [1, 2]):
        y = soil[col][soil.Depth == soiltype]
        x = np.random.normal(pos, 0.04, size=len(y))
        axes[i].plot(x, y, 'r.', alpha=0.08)

# generating dummy variables out of Depth
soildepth = pd.get_dummies(soil['Depth'])
soildepth.head(10)

# plotting 50 consecutive entries of Subsoil
soildepth.Subsoil[1000:1050].plot(figsize=(10, 8))

# plotting the first 30 entries of the REF3 feature. Consecutive values come in pairs,
# probably meaning that soil samples are collected at the same exact point at two
# different depths. This also explains why boxplots of the same predictor split by
# Depth look so similar to one another.
soil.REF3[:30].plot(figsize=(10, 8))
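# Illustrative check, not in the original notebook: if consecutive rows really are
# the same sampling point at two depths, their spatial predictors (REF3 here)
# should match pairwise. This only assumes the row order of the raw csv.
vals = soil.REF3.values
n = (len(vals) // 2) * 2  # trim a possible odd trailing row
paired = vals[0:n:2] == vals[1:n:2]
print 'Fraction of consecutive row pairs sharing REF3: ', paired.mean()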
# plotting the whole REF3 feature
soil.REF3.plot(figsize=(10, 8))

# rather than the pattern above, we would probably like the data to look like the
# following, i.e. randomized. we'll take care of the shuffling in the Machine Learning pipeline.
pd.DataFrame(soil.REF3).apply(np.random.permutation).plot(figsize=(10, 8))

def computeMCRMSE(y_actual, y_predicted):
    """
    computes the mean columnwise root mean squared error
    given the 5 actual and the 5 predicted target columns
    """
    n = len(y_actual[0])
    summ = 0
    for i in range(len(y_actual)):
        summ += np.sqrt(((np.array(y_actual[i]) - y_predicted[i]) ** 2).sum() / n)
    return summ / 5

def performParameterSelection(model_name, X, y):
    """
    performs GridSearchCV on the train set with a specific model and parameter grid
    """
    model, param_grid = MODELS[model_name]
    gs = GridSearchCV(model, param_grid, n_jobs=-1, cv=5, verbose=1)
    gs.fit(X, y)
    pprint(sorted(gs.grid_scores_, key=lambda x: -x.mean_validation_score))
    fname_out = '{}-{}-{}.pickle'.format(model_name, y.name, datetime.now())
    with open(fname_out, 'wb') as fout:
        cPickle.dump(gs, fout, -1)
    print "Saved model to {}".format(fname_out)

def getDummiedSoilDepth(df):
    """
    gets dummy variables from the binary categorical Depth (Subsoil, Topsoil),
    then drops the categorical column
    """
    soildepth = pd.get_dummies(df['Depth'])
    df.insert(0, 'Subsoil', soildepth['Subsoil'])
    df.insert(0, 'Topsoil', soildepth['Topsoil'])
    df = df.drop('Depth', 1)
    return df

def shuffleData(df):
    """
    randomly shuffles the dataframe, because entries come in Depth pairs
    """
    df = df.reindex(np.random.permutation(df.index))
    return df

def splitTrainTest(dataframe, percentage):
    """
    splits the shuffled dataframe into train and test set according to percentage (float).
    note that the predictors are scaled on the full dataset before the split.
    """
    index = int(np.floor(dataframe.shape[0] * percentage))
    predictors = dataframe.iloc[:, :-5]
    predictors = pd.DataFrame(preprocessing.scale(predictors),
                              index=predictors.index, columns=predictors.columns)
    output = dataframe.iloc[:, -5:]

    X_train = predictors[:index]
    y_train = output[:index]
    Ca_train = y_train['Ca']
    P_train = y_train['P']
    pH_train = y_train['pH']
    SOC_train = y_train['SOC']
    Sand_train = y_train['Sand']

    X_test = predictors[index:]
    y_test = output[index:]
    Ca_test = y_test['Ca']
    P_test = y_test['P']
    pH_test = y_test['pH']
    SOC_test = y_test['SOC']
    Sand_test = y_test['Sand']

    return X_train, Ca_train, P_train, pH_train, SOC_train, Sand_train, \
           X_test, Ca_test, P_test, pH_test, SOC_test, Sand_test

def performSVR(X_train, y_train, X_test, parameters):
    """ SVR regression """
    C = parameters[0]
    epsilon = parameters[1]
    kernel = parameters[2]
    clf = svm.SVR(C=C, epsilon=epsilon, kernel=kernel)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)

def performRidge(X_train, y_train, X_test, parameters):
    """ Ridge regression """
    alpha = parameters[0]
    clf = linear_model.Ridge(alpha=alpha)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)

def performLasso(X_train, y_train, X_test, parameters):
    """ Lasso regression """
    alpha = parameters[0]
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)

def performENET(X_train, y_train, X_test, parameters):
    """ ElasticNet regression """
    alpha = parameters[0]
    l1_ratio = parameters[1]
    clf = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    clf.fit(X_train, y_train)
    fname_out = '{}-{}final.pickle'.format(y_train.name, datetime.now())
    with open(fname_out, 'wb') as f:
        cPickle.dump(clf, f, -1)
    return clf.predict(X_test)
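# Illustrative check, not in the original notebook: computeMCRMSE on toy data.
# Five "columns" of two samples each, every prediction off by exactly 1, so each
# columnwise RMSE is 1 and the mean columnwise RMSE should print 1.0.
toy_actual = [np.array([1., 2.]), np.array([3., 4.]), np.array([5., 6.]),
              np.array([7., 8.]), np.array([9., 10.])]
toy_predicted = [col + 1. for col in toy_actual]
print 'toy MCRMSE: ', computeMCRMSE(toy_actual, toy_predicted)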
# Scikit-Learn Algorithm Cheat Sheet
from IPython.core.display import HTML
HTML("")

MODELS = {
    'lasso': (
        linear_model.Lasso(),
        {'alpha': [0.001, 0.01, 0.1, 1]},
    ),
    'ridge': (
        linear_model.Ridge(),
        {'alpha': [0.001, 0.01, 0.1, 1]},
    ),
    'elasticnet': (
        linear_model.ElasticNet(),
        {'alpha': [0.001, 0.01, 0.1, 1], 'l1_ratio': [0.001, 0.01, 0.1, 1]},
    ),
    'svr': (
        svm.SVR(),
        {'C': [0.001, 0.01, 0.1, 1], 'epsilon': [0.001, 0.01, 0.1, 1], 'kernel': ['linear']},
    )
}

# rereading the original csv file to overwrite the soil data frame and start from scratch
soil = pd.read_csv('./train/training.csv', index_col=0)

# generating dummy variables out of the Depth feature
soil = getDummiedSoilDepth(soil)

# setting the seed and shuffling the data frame
np.random.seed(2013)
soil = shuffleData(soil)

# generating the train and test sets with all the target variables
X_train, Ca_train, P_train, pH_train, SOC_train, Sand_train, \
X_test, Ca_test, P_test, pH_test, SOC_test, Sand_test = splitTrainTest(soil, 0.8)

trainCV = [Ca_train, P_train, pH_train, SOC_train, Sand_train]

# for each target variable a CV on the train set will be performed.
# GridSearchCV will print to screen and, at the same time, a log file with all results will be saved.
# The best model for each algorithm and target will be saved to disk.
#######################################################################################################
# TWO LINES ARE COMMENTED OUT IN THE FOLLOWING CODE TO PREVENT WRITING TO FILE.
# UNCOMMENT THEM IF YOU WANT EVERYTHING TO RUN PROPERLY.
#######################################################################################################
for y_tr in trainCV:
    logname = './{}-{}.txt'.format(y_tr.name, datetime.now())
    #sys.stdout = open(logname, 'w')
    for model in MODELS.keys():
        pass
        #print '======================================='
        #print y_tr.name, model
        #print '======================================='
        #performParameterSelection(model, X_train, y_tr)

from IPython.core.display import Image
Image(filename='cv.png', width=700)

# test set with all the test targets previously generated
test = [X_test, Ca_test, P_test, pH_test, SOC_test, Sand_test]

# best models found by the grid search
best_Ca = './Sep25/svr-Ca-2014-09-25 08:11:09.934405.pickle'
best_P = './Sep25/svr-P-2014-09-25 08:31:53.611168.pickle'
best_pH = './Sep25/ridge-pH-2014-09-25 08:48:25.247992.pickle'
best_SOC = './Sep25/ridge-SOC-2014-09-25 09:06:50.757604.pickle'
best_Sand = './Sep25/svr-Sand-2014-09-25 09:23:22.520904.pickle'
best_models = [best_Ca, best_P, best_pH, best_SOC, best_Sand]

y_predicted = []
# each saved model is applied to the test set to predict its target value
for model in best_models:
    with open(model, 'rb') as fin:
        mod = cPickle.load(fin)
    y_predicted.append(mod.predict(X_test))

# computing the mean columnwise root mean squared error
print 'MCRMSE: ', computeMCRMSE(test[1:], y_predicted)
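# Illustrative sketch, not in the original notebook: since performParameterSelection
# pickles the whole GridSearchCV object, a saved file can be inspected to see which
# hyper-parameters won. best_params_ and best_score_ are standard GridSearchCV
# attributes; best_pH is one of the paths listed above.
with open(best_pH, 'rb') as fin:
    gs = cPickle.load(fin)
print 'best params for pH: ', gs.best_params_
print 'best CV score for pH: ', gs.best_score_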