We load the basic libraries.
import numpy as np
import pandas as pd
The decimal separator in the data files has already been changed from a comma to a dot.
We load the training and the test data.
data = pd.read_csv("Predictive Modelling Train_dot.txt", sep="|", index_col="ID")
len(data.columns)
data.columns
data_test = pd.read_csv("Predictive Modelling Test_dot.txt", sep="|", index_col="ID")
data_test.columns
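As an aside, the manual preprocessing of the decimal separator can be avoided: pandas parses comma decimals directly via the decimal argument. A sketch, assuming hypothetical names for the original comma-decimal files:
# Hypothetical alternative: read the original comma-decimal files directly.
# pd.read_csv("Predictive Modelling Train.txt", sep="|", index_col="ID", decimal=",")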
The class labels are provided as an extra column of the training dataset called "TARGET". We separate them from the features.
target = data["TARGET"]
del data["TARGET"]
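Before modelling it is worth checking how imbalanced the target is; a quick sanity check, assuming binary 0/1 labels:
target.value_counts(normalize=True)  # fraction of records in each class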
The original training dataset is partitioned into a training part with 75% of the data and a testing part with 25% of the data. The split is done randomly to avoid problems with a possible underlying ordering of the records.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.astype(np.float64).values,
target.astype(np.float64).values, train_size=0.75, test_size=0.25)
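With an imbalanced target, a stratified split keeps the class ratio identical in both parts. A sketch of the same call with stratification (parameters otherwise unchanged):
# Hypothetical variant: preserve the class ratio across the split.
# X_train, X_test, y_train, y_test = train_test_split(
#     data.astype(np.float64).values, target.astype(np.float64).values,
#     train_size=0.75, test_size=0.25, stratify=target.values)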
We will use TPOT to search for a suitable classifier and its hyperparameters.
from tpot import TPOTClassifier
pipeline_optimizer = TPOTClassifier(generations=1, population_size=20, cv=5, random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)
pipeline_optimizer.export("pipeline2_1gen.py")
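The optimized pipeline can also be scored directly on the held-out part, since TPOTClassifier exposes the usual score method:
pipeline_optimizer.score(X_test, y_test)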
The first candidate is a Bernoulli naive Bayes pipeline.
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
nb_pipeline = make_pipeline(
BernoulliNB(alpha=0.1, binarize=0.46, fit_prior=True)
)
nb_pipeline.fit(X_train, y_train)
nb_pipeline.score(data.astype(np.float64).values, target.astype(np.float64).values)  # accuracy on the full dataset, including training rows
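Since that score includes the rows the model was trained on, the held-out split gives a less optimistic estimate:
nb_pipeline.score(X_test, y_test)  # accuracy on the 25% held-out part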
result_nb = nb_pipeline.predict(data.astype(np.float64).values)
import matplotlib.pyplot as plt
from sklearn import metrics
%matplotlib inline
fpr_nb, tpr_nb, thresholds_nb = metrics.roc_curve(target.astype(np.float64).values, result_nb)
auc_nb = metrics.auc(fpr_nb, tpr_nb)
print "False positive rate: {:6.4f}".format(fpr_nb[1])
print "True positive rate: {:6.4f}".format(tpr_nb[1])
print "Area under the curve: {:6.4f}".format(auc_nb)
lw = 2
plt.plot(fpr_nb, tpr_nb, color='darkorange', lw=lw, label="AUC: {:6.4f}".format(auc_nb))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic: BernoulliNB classifier')
plt.legend(loc="lower right")
The second candidate is a gradient boosting classifier.
from sklearn.ensemble import GradientBoostingClassifier
gb_pipeline = make_pipeline(
GradientBoostingClassifier(learning_rate=0.83, max_features=0.83, n_estimators=500)
)
gb_pipeline.fit(X_train, y_train)
gb_pipeline.score(data.astype(np.float64).values, target.astype(np.float64).values)
result_gb = gb_pipeline.predict(data.astype(np.float64).values)
We check the number of misclassifications.
np.sum(np.abs(target.astype(np.float64).values-result_gb)) # Number of errors
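The same errors broken down by type, via a confusion matrix:
metrics.confusion_matrix(target.astype(np.float64).values, result_gb)  # rows: true class, columns: predicted class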
fpr_gb, tpr_gb, thresholds_gb = metrics.roc_curve(target.astype(np.float64).values, result_gb)
auc_gb = metrics.auc(fpr_gb, tpr_gb)
plt.plot(fpr_gb, tpr_gb, color='darkorange', lw=lw, label="AUC: {:6.4f}".format(auc_gb))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic: Gradient Boosting classifier')
plt.legend(loc="lower right")
The third candidate is a random forest classifier.
from sklearn.ensemble import RandomForestClassifier
rf_pipeline = make_pipeline(
RandomForestClassifier(n_estimators=500)
)
rf_pipeline.fit(X_train, y_train)
rf_pipeline.score(data.astype(np.float64).values, target.astype(np.float64).values)
result_rf = rf_pipeline.predict(data.astype(np.float64).values)
Number of misclassifications
np.sum(np.abs(target.astype(np.float64).values-result_rf))
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(target.astype(np.float64).values, result_rf)
auc_rf = metrics.auc(fpr_rf, tpr_rf)
plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=lw, label="AUC: {:6.4f}".format(auc_rf))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic: Random Forest classifier')
plt.legend(loc="lower right")
lw = 2
plt.plot(fpr_nb, tpr_nb, color='darkorange', lw=lw, label="AUC NB: {:6.4f}".format(auc_nb))
plt.plot(fpr_gb, tpr_gb, color='red', lw=lw, label="AUC GB: {:6.4f}".format(auc_gb))
plt.plot(fpr_rf, tpr_rf, color='green', lw=lw, label="AUC RF: {:6.4f}".format(auc_rf))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic comparison')
plt.legend(loc="lower right")
plt.savefig("roc.png", dpi=300)
plt.savefig("roc.svg")
We check whether the positive predictions are clustered at the end of the test array.
def per_results(result):
    # fraction of records predicted positive
    return np.sum(result) / len(result)
result_test_rf = rf_pipeline.predict(data_test.astype(np.float64).values)
per_results(result_test_rf)
result_test_rf.shape
plt.plot(result_test_rf)
plt.xlabel("record")
plt.ylabel("Predicted TARGET")
plt.savefig("predict.png", dpi=300)
plt.savefig("predict.svg")
plt.plot(result_test_rf[517:202517].reshape((202, 1000)).sum(axis=1))  # positive predictions per block of 1000 records
The positive predictions are clustered towards the end of the array, confirming the suspected lack of randomization in the test data.
result_test_nb = nb_pipeline.predict(data_test.astype(np.float64).values)
len(result_test_nb)
np.sum(result_test_nb)
per_results(result_test_nb)
t = target.astype(np.float64).values
np.sum(t)/len(t)
result_test_gb = gb_pipeline.predict(data_test.astype(np.float64).values)
np.sum(np.abs(result_test_gb-result_test_rf))  # number of records where GB and RF disagree
len(result_test_gb)
np.sum(result_test_gb)
per_results(result_test_gb)
gb_pipeline2 = make_pipeline(
GradientBoostingClassifier(learning_rate=0.83, max_features=0.83, n_estimators=500)
)
data['ID'] = data.index  # expose the record ID (i.e. the ordering) as an explicit feature
X_train2, X_test2, y_train2, y_test2 = train_test_split(data.astype(np.float64).values,
target.astype(np.float64).values, train_size=0.75, test_size=0.25)
gb_pipeline2.fit(X_train2, y_train2)
gb_pipeline2.score(data.astype(np.float64).values, target.astype(np.float64).values)
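If the record ordering really leaks the label, the appended ID column should dominate the fitted model. A quick check (the step name follows make_pipeline's lowercased-class-name convention):
gb2 = gb_pipeline2.named_steps["gradientboostingclassifier"]
gb2.feature_importances_[-1]  # importance of the ID column, which was appended last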
result_gb2 = gb_pipeline2.predict(data.astype(np.float64).values)
fpr_gb2, tpr_gb2, thresholds_gb2 = metrics.roc_curve(target.astype(np.float64).values, result_gb2)
auc_gb2 = metrics.auc(fpr_gb2, tpr_gb2)
print "False positive rate: {:6.4f}".format(fpr_gb2[1])
print "True positive rate: {:6.4f}".format(tpr_gb2[1])
print "Area under the curve: {:6.4f}".format(auc_gb2)
plt.plot(fpr_gb2, tpr_gb2, color='darkorange', lw=lw, label="AUC: {:6.4f}".format(auc_gb2))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic: Gradient Boosting classifier')
plt.legend(loc="lower right")
plt.savefig("final_roc.png", dpi=300)
plt.savefig("final_roc.svg")
data_test['ID'] = data_test.index  # add the ID feature to the test data as well
Prediction for the test sample
result_test_gb2 = gb_pipeline2.predict(data_test.astype(np.float64).values)
# Extrapolated labels: mark the trailing records as positive, matching the training positive rate.
result_test_ex = np.zeros(len(result_test_rf))
#change_index = int(round(len(result_test_rf)*(1.-np.sum(t)/len(t))))
change_index = 199352
result_test_ex[change_index:] = 1
np.sum(np.abs(result_test_rf-result_test_ex))   # disagreements between RF and the extrapolated labels
np.sum(np.abs(result_test_gb2-result_test_ex))  # disagreements between GB2 and the extrapolated labels
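The same comparison expressed as a fraction of matching records:
1.0 - np.mean(np.abs(result_test_rf - result_test_ex))   # agreement rate, RF vs. extrapolated labels
1.0 - np.mean(np.abs(result_test_gb2 - result_test_ex))  # agreement rate, GB2 vs. extrapolated labels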
import json
import codecs
def export_json(dataset, filename, save_numpy=True, save_respuesta=True):
    # Export a prediction vector as JSON and, optionally, as a NumPy binary
    # and in the "Respuesta" submission format.
    json.dump(dataset.astype(int).tolist(),
              codecs.open(filename+".json", 'w', encoding='utf-8'),
              separators=(',', ':'),
              sort_keys=True,
              indent=4)
    if save_numpy:
        np.save(filename+".npy", dataset.astype(int))
    if save_respuesta:
        data_test["Respuesta"] = dataset.astype(int)
        data_test[["Respuesta"]].to_csv(filename+".txt", sep="|")
export_json(result_test_nb, "result_nb")
export_json(result_test_gb, "result_gb")
export_json(result_test_rf, "result_rf")
export_json(result_test_ex, "result_ex")
export_json(result_test_gb2, "result_final")
#result = np.array(json.loads(codecs.open(file_path, 'r', encoding='utf-8').read()))
We also export the final prediction in the required format.
data_test["Respuesta"] = result_test_gb2.astype(int)
#data_test["Id"] = data_test["ID"]
data_test[["Respuesta"]].to_csv("respuesta.txt", sep="|")