# -*- coding: utf-8 -*-
"""SAR (GF3 HH/HV) pixel classification: compares several scikit-learn
classifiers on a two-band backscatter feature space."""

# --- Standard library ---
import argparse
import csv
import datetime
import os.path
import time
import warnings

# --- Scientific stack ---
import numpy as np  # (was imported twice in the original paste)
import numpy.ma as ma
import pandas as pd
import scipy.stats as st
from scipy.stats import randint as sp_randint
# NOTE(review): scipy.misc image helpers were removed in SciPy >= 1.2 and
# `mpimg` is never used below — confirm before relying on it.
import scipy.misc as mpimg
import tifffile as tiff

# Visualization
import matplotlib.pyplot as plt
# scatter_matrix moved out of pandas.tools.plotting (removed in pandas >= 0.25)
from pandas.plotting import scatter_matrix

# Feature Selection and Encoding
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
import sklearn.ensemble as ske
from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# train_test_split moved out of the removed sklearn.cross_validation module
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# sklearn.externals.joblib was removed in scikit-learn >= 0.23; use the
# standalone joblib package (installed alongside scikit-learn).
import joblib
import tensorflow as tf

# Grid and Random Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Managing Warnings: silence the (many) convergence/deprecation warnings
# that the grid searches below would otherwise flood stdout with.
warnings.filterwarnings('ignore')
def readdata(path="/mnt/datapool/RemoteSensingData/S1_DV/radasat2/r2_zip/NewFolder/GF3/"):
    """Load the GF3 scene and its ground truth, returning per-pixel features.

    Parameters
    ----------
    path : str
        Directory containing ``gt.tif`` (ground-truth labels) and ``img.tif``
        (at least two bands; band 0 is treated as HH, band 1 as HV).
        Defaults to the original hard-coded location for backward compatibility.

    Returns
    -------
    (data, label) : tuple of ndarray
        ``data`` has shape (n_valid_pixels, 2) with columns (HH, HV);
        ``label`` has shape (n_valid_pixels,). Pixels whose ground-truth
        value is 255 (NoData) are dropped from both.
    """
    gt = tiff.imread(os.path.join(path, "gt.tif"))
    img = tiff.imread(os.path.join(path, "img.tif"))
    hh = img[:, :, 0]
    hv = img[:, :, 1]
    label = gt.reshape(-1)
    # 255 marks NoData pixels in the ground truth; keep everything else.
    # (The original used the Python-2-only `<>` operator here.)
    valid = label != 255
    # Pair the two band intensities column-wise; the original wrapped this
    # in a redundant np.vstack which is a no-op on an already-2D array.
    data = np.stack((hh.reshape(-1)[valid], hv.reshape(-1)[valid]), axis=-1)
    label = label[valid]
    print(label.shape)
    print(data.shape)
    return (data, label)
# Build the full feature matrix / label vector from the GF3 scene, then
# hold out 30% of the pixels for testing; random_state=0 makes the split
# reproducible across runs.
X,y=readdata()
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.3, random_state=0)
print (X_train.shape)
print(y_train.shape)
# notebook output: (286066, 2) (286066,)
def plot_roc_curve(y_test, preds):
    """Plot the ROC curve for binary predictions and show the AUC.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    preds : array-like
        Scores or probability estimates for the positive class.

    Displays the plot via ``plt.show()``; returns nothing.
    """
    # (Body indentation was lost in the original paste; restored here.)
    fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal = performance of a random classifier, for reference.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def fit_ml_algo(algo, X_train, y_train, X_test, cv, y_test=None):
    """Fit one estimator and report holdout + cross-validated accuracy.

    Parameters
    ----------
    algo : sklearn estimator
        Unfitted classifier.
    X_train, y_train : array-like
        Training features and labels.
    X_test : array-like
        Holdout features.
    cv : int
        Number of cross-validation folds for ``cross_val_predict``.
    y_test : array-like, optional
        Holdout labels. The original implementation silently read the
        module-level global ``y_test``; that fallback is kept for backward
        compatibility, but callers should pass it explicitly.

    Returns
    -------
    (train_pred, test_pred, acc, acc_cv, probs)
        CV predictions on the training set, predictions on the holdout set,
        holdout accuracy (%), CV accuracy (%), and positive-class
        probabilities (or the string "Not Available" for estimators
        without ``predict_proba``).
    """
    # One pass over the holdout split.
    model = algo.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    if isinstance(algo, (LogisticRegression,
                         KNeighborsClassifier,
                         GaussianNB,
                         DecisionTreeClassifier,
                         RandomForestClassifier,
                         GradientBoostingClassifier)):
        probs = model.predict_proba(X_test)[:, 1]
    else:
        # String sentinel kept as-is: existing callers unpack and ignore it.
        probs = "Not Available"
    if y_test is None:
        # Backward-compat fallback to the module global the original used.
        y_test = globals()['y_test']
    acc = round(model.score(X_test, y_test) * 100, 2)
    # Cross-validated predictions on the training set.
    train_pred = model_selection.cross_val_predict(algo,
                                                   X_train,
                                                   y_train,
                                                   cv=cv,
                                                   n_jobs=-1)
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    return train_pred, test_pred, acc, acc_cv, probs
# Logistic Regression - Random Search for Hyperparameters
# Utility function to report best scores
def report(results, n_top=5):
    """Print the top-``n_top`` ranked candidates from a CV search.

    Parameters
    ----------
    results : dict
        ``cv_results_`` mapping from a fitted GridSearchCV /
        RandomizedSearchCV: needs keys 'rank_test_score',
        'mean_test_score', 'std_test_score' and 'params'.
    n_top : int
        How many ranks to print (ties within a rank are all printed).
    """
    # (Body indentation was lost in the original paste; restored here.)
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# Specify parameters and distributions to sample from.
# C / intercept_scaling span 40 decades log-uniformly.
param_dist = {'penalty': ['l2', 'l1'],
              'class_weight': [None, 'balanced'],
              'C': np.logspace(-20, 20, 10000),
              'intercept_scaling': np.logspace(-20, 20, 10000)}
# Run Randomized Search
n_iter_search = 10
# liblinear supports both 'l1' and 'l2' penalties (and was the historical
# default solver); the modern default (lbfgs) rejects 'l1', which would
# make roughly half the sampled candidates fail.
lrc = LogisticRegression(solver='liblinear')
random_search = RandomizedSearchCV(lrc,
                                   n_jobs=-1,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
# notebook output: RandomizedSearchCV took 23.10 seconds for 10 candidates parameter settings. Model with rank: 1 Mean validation score: 0.943 (std: 0.001) Parameters: {'penalty': 'l1', 'C': 0.18110306173696183, 'intercept_scaling': 1.0698008294791242e-07, 'class_weight': 'balanced'} Model with rank: 2 Mean validation score: 0.940 (std: 0.001) Parameters: {'penalty': 'l2', 'C': 61727.551134160356, 'intercept_scaling': 1.9816625542394187e-10, 'class_weight': 'balanced'} Model with rank: 3 Mean validation score: 0.940 (std: 0.000) Parameters: {'penalty': 'l1', 'C': 6.134494449932439e+17, 'intercept_scaling': 3.6387824850747874e-11, 'class_weight': 'balanced'} Model with rank: 4 Mean validation score: 0.930 (std: 0.000) Parameters: {'penalty': 'l2', 'C': 8452646356.7554703, 'intercept_scaling': 0.011529386030693178, 'class_weight': None} Model with rank: 5 Mean validation score: 0.930 (std: 0.000) Parameters: {'penalty': 'l2', 'C': 289.89867115042091, 'intercept_scaling': 1.600370075165793e-18, 'class_weight': None}
# Logistic Regression
# One holdout fit plus 10-fold CV accuracy on the training set; probs_log
# keeps the positive-class probabilities for a later ROC plot.
start_time = time.time()
train_pred_log, test_pred_log, acc_log, acc_cv_log, probs_log = fit_ml_algo(LogisticRegression(n_jobs = -1),
                                                                            X_train,
                                                                            y_train,
                                                                            X_test,
                                                                            10)
log_time = (time.time() - start_time)
# Holdout accuracy, cross-validated accuracy, and wall-clock time.
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))
# notebook output: Accuracy: 93.41 Accuracy CV 10-Fold: 93.22 Running Time: 0:00:16.047096
#k-Nearest Neighbors
# 3-NN in the 2-D (HH, HV) feature space; n_jobs=-1 parallelizes the
# neighbor queries across all cores.
start_time = time.time()
train_pred_knn, test_pred_knn, acc_knn, acc_cv_knn, probs_knn = fit_ml_algo(KNeighborsClassifier(n_neighbors = 3,
                                                                                                 n_jobs = -1),
                                                                            X_train,
                                                                            y_train,
                                                                            X_test,
                                                                            10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))
# notebook output: Accuracy: 94.29 Accuracy CV 10-Fold: 94.22 Running Time: 0:02:51.263258
# Gaussian Naive Bayes
# Parameter-free baseline; by far the fastest model in this comparison.
start_time = time.time()
train_pred_gaussian, test_pred_gaussian, acc_gaussian, acc_cv_gaussian, probs_gau = fit_ml_algo(GaussianNB(),
                                                                                                X_train,
                                                                                                y_train,
                                                                                                X_test,
                                                                                                10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))
# notebook output: Accuracy: 93.26 Accuracy CV 10-Fold: 93.07 Running Time: 0:00:00.964405
# Linear SVC
# LinearSVC has no predict_proba, so the probability slot is discarded (_).
start_time = time.time()
train_pred_svc, test_pred_svc, acc_linear_svc, acc_cv_linear_svc, _ = fit_ml_algo(LinearSVC(),
                                                                                  X_train,
                                                                                  y_train,
                                                                                  X_test,
                                                                                  10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))
# notebook output: Accuracy: 92.33 Accuracy CV 10-Fold: 91.66 Running Time: 0:09:26.554808
# Stochastic Gradient Descent
# Linear model trained with SGD; no predict_proba with the default hinge
# loss, so the probability slot is discarded (_).
start_time = time.time()
train_pred_sgd, test_pred_sgd, acc_sgd, acc_cv_sgd, _ = fit_ml_algo(SGDClassifier(n_jobs = -1),
                                                                    X_train,
                                                                    y_train,
                                                                    X_test,
                                                                    10)
sgd_time = (time.time() - start_time)
print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))
# notebook output: Accuracy: 91.99 Accuracy CV 10-Fold: 91.51 Running Time: 0:00:02.301628
# Decision Tree Classifier
# Default (unpruned) tree; NOTE(review): the high holdout accuracy here may
# partly reflect overfitting on correlated neighboring pixels — verify.
start_time = time.time()
train_pred_dt, test_pred_dt, acc_dt, acc_cv_dt, probs_dt = fit_ml_algo(DecisionTreeClassifier(),
                                                                       X_train,
                                                                       y_train,
                                                                       X_test,
                                                                       10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))
# notebook output: Accuracy: 96.84 Accuracy CV 10-Fold: 96.34 Running Time: 0:00:03.184184
# Random Forest Classifier
# 10 trees with explicit regularization (leaf/split minimums);
# max_features=2 uses both available bands at every split.
start_time = time.time()
rfc = RandomForestClassifier(n_estimators=10,
                             min_samples_leaf=2,
                             min_samples_split=17,
                             criterion='gini',
                             max_features=2)
train_pred_rf, test_pred_rf, acc_rf, acc_cv_rf, probs_rf = fit_ml_algo(rfc,
                                                                       X_train,
                                                                       y_train,
                                                                       X_test,
                                                                       10)
rf_time = (time.time() - start_time)
print("Accuracy: %s" % acc_rf)
print("Accuracy CV 10-Fold: %s" % acc_cv_rf)
print("Running Time: %s" % datetime.timedelta(seconds=rf_time))
# notebook output: Accuracy: 95.06 Accuracy CV 10-Fold: 94.85 Running Time: 0:00:17.790645
# Gradient Boosting Trees
# Default GradientBoostingClassifier (100 sequential trees) — the slowest
# of the tree models here since boosting cannot be parallelized.
start_time = time.time()
train_pred_gbt, test_pred_gbt, acc_gbt, acc_cv_gbt, probs_gbt = fit_ml_algo(GradientBoostingClassifier(),
                                                                            X_train,
                                                                            y_train,
                                                                            X_test,
                                                                            10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))
# notebook output: Accuracy: 95.02 Accuracy CV 10-Fold: 94.88 Running Time: 0:05:33.582986
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Compare several classifiers with identical 10-fold CV on the full data
# set (X, y) and summarize the score distributions with a boxplot.
seed = 7
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    # shuffle=True is required when random_state is set (scikit-learn
    # >= 0.24 raises ValueError otherwise) and removes ordering bias in
    # the flattened pixel data.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
'''
'''
# notebook output: LR: 0.904087 (0.108555) LDA: 0.912762 (0.093905) KNN: 0.916538 (0.086706) CART: 0.908201 (0.082164) NB: 0.902763 (0.106315)