#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os
import warnings
import time
import copy

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, Lasso, lasso_path, lars_path, LassoLarsIC
from sklearn.neural_network import MLPRegressor

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # hide messy TensorFlow warnings
warnings.filterwarnings("ignore")         # hide messy NumPy warnings

import keras
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Input
from keras.models import Model
from keras.layers.recurrent import LSTM, GRU
from keras.regularizers import l1
from keras.models import Sequential
from keras.models import load_model

# plotly charts
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff
init_notebook_mode(connected=True)


# In[2]:

# splines chart
# R code
"""
testdata <- data.frame(seq(from=1, to = 1000))
testdata['x'] <- seq(from=0, to = 3*pi, length.out=1000)
testdata['y'] <- -cos(testdata$x) + testdata$x/(3*pi) + rnorm(1000)*0.25

nrows <- 300
indexes <- sample(nrow(testdata), nrows)
indexes <- indexes[order(indexes)]
testdata <- testdata[indexes, ]
plot(testdata$x, testdata$y, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=2)
testdata$fit2 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit2, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=3)
testdata$fit3 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit3, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=4)
testdata$fit4 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit4, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=5)
testdata$fit5 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit5, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=6)
testdata$fit6 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit6, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=20)
testdata$fit20 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit20, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=40)
testdata$fit40 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit40, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=80)
testdata$fit80 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit80, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=160)
testdata$fit160 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit160, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,df=240)
testdata$fit240 <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fit240, main="Scatterplot", xlab="x", ylab="y", pch=1)

fit <- smooth.spline(testdata$x,testdata$y,cv=TRUE)
testdata$fitcv <- predict(fit, newdata=testdata$x)[2]$y
plot(testdata$x, testdata$fitcv, main="Scatterplot", xlab="x", ylab="y", pch=1)

write.csv(testdata, "splines.csv")
print(fit)
"""

splines = pd.read_csv('splines.csv')
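# In[ ]:

# For reference, a rough Python analogue of the R smoothing-spline sweep above, using
# scipy's UnivariateSpline (which we also use further down). This is only a sketch:
# the smoothing factor s is not the same parameterization as R's df, and it assumes
# splines.csv contains the x and y columns written by the R code.
from scipy.interpolate import UnivariateSpline

_xs = splines['x'].values
_ys = splines['y'].values
for _s in [1.0, 10.0, 100.0, 1000.0]:   # larger s = smoother fit
    _spl = UnivariateSpline(_xs, _ys, s=_s)
    print(_s, mean_squared_error(_ys, _spl(_xs)))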
# In[3]:

data = splines.values
x = data[:,2]
y = data[:,3]
fit2 = data[:,4]
fit3 = data[:,5]
fit4 = data[:,6]
fit5 = data[:,7]
fit6 = data[:,8]
fit20 = data[:,9]
fit40 = data[:,10]
fit80 = data[:,11]
fit160 = data[:,12]
fit240 = data[:,13]
fitcv = data[:,14]

plotdata = []
plotdata.append(Scatter(x=x, y=y,
                        name='Raw data',
                        mode='markers',
                        marker=dict(size=3,
                                    line=dict(width=2,
                                              color='rgba(128, 128, 192, .75)'))))
plotdata.append(Scatter(x=x, y=fit2,
                        name='Linear (high bias / low variance)',
                        mode='lines',
                        line=dict(color='orange', width=2)))
plotdata.append(Scatter(x=x, y=fit4,
                        name='Underfit (high bias / low variance)',
                        mode='lines',
                        line=dict(width=2, color='blue')))
plotdata.append(Scatter(x=x, y=fitcv,
                        name='Lowest CV error',
                        mode='lines',
                        line=dict(color='green', width=5)))
plotdata.append(Scatter(x=x, y=fit240,
                        name='Overfit (high variance / low bias)',
                        mode='lines',
                        line=dict(width=2, color='rgb(192,0,0)')))

layout = Layout(yaxis=dict(autorange=True))
fig = Figure(data=plotdata, layout=layout)
iplot(fig)  # png to save notebook w/static image


# In[4]:

# create a data set: sin wave plus trend plus random noise
nobs = 2000
x = np.linspace(0, 3*np.pi, num=nobs)
y = -np.cos(x) + x/(3*np.pi) + np.random.normal(0, 0.25, nobs)

x2 = np.linspace(0, 15*np.pi, num=nobs)
y1 = -np.sin(x2) + x2/(3*np.pi) + np.random.normal(0, 0.25, nobs)
y2 = -np.cos(x2) + x2/(3*np.pi) + np.random.normal(0, 0.25, nobs)

# sample 20% to make it more sparse
#arr = np.arange(nobs)
#np.random.shuffle(arr)
#x = arr[:200]
#y = y[x]


# In[5]:

# chart it
def mychart(*args):
    # pass some n x 1 arrays: x, y, z
    # 1st array is the independent var; reshape to a 1-dimensional array
    x = args[0].reshape(-1)
    # the remaining args are dependent vars plotted on the y axis
    data = []
    for i in range(1, len(args)):
        data.append(Scatter(x=x, y=args[i].reshape(-1), mode='markers'))
    layout = Layout(yaxis=dict(autorange=True))
    fig = Figure(data=data, layout=layout)
    return iplot(fig)  # png to save notebook w/static image

mychart(x, y)


# In[6]:

mychart(x2, y1, y2)


# In[7]:

import pandas as pd
pd.read_csv('splines.csv')


# In[8]:

# fit with sklearn MLPRegressor
layer1_sizes = [1, 2, 4, 8]
layer2_sizes = [1, 2, 4, 8]

import itertools
from plotly import tools

def run_grid(build_model_fn, layer1_sizes, layer2_sizes, x, y, epochs=None):
    nrows = len(layer1_sizes)
    ncols = len(layer2_sizes)
    hyperparameter_list = list(itertools.product(layer1_sizes, layer2_sizes))
    subplot_titles = ["%d units, %d units" % (layer1_size, layer2_size)
                      for (layer1_size, layer2_size) in hyperparameter_list]
    fig = tools.make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)

    for count, (layer1_size, layer2_size) in enumerate(hyperparameter_list):
        print("Layer 1 units: %d, Layer 2 units: %d" % (layer1_size, layer2_size))
        print("Running experiment %d of %d : %d %d" % (count+1, len(hyperparameter_list),
                                                       layer1_size, layer2_size))
        model = build_model_fn(hidden_layer_sizes=(layer1_size, layer2_size),
                               activation='tanh',
                               max_iter=10000,
                               tol=1e-10,
                               solver='lbfgs')
        x = x.reshape(-1, 1)
        if epochs:
            model.fit(x, y, epochs=epochs)
        else:
            model.fit(x, y)
        y_pred = model.predict(x)
        train_score = mean_squared_error(y, y_pred)
        print(train_score)
        print("%s Train MSE: %s" % (time.strftime("%H:%M:%S"), str(train_score)))
        print("%s Train R-squared: %.6f" % (time.strftime("%H:%M:%S"), 1 - train_score/y.var()))
        z = model.predict(x)
        trace = Scatter(x=x.reshape(-1),
                        y=z.reshape(-1),
                        name='fit',
                        mode='markers',
                        marker=dict(size=2))
        fig.append_trace(trace, count // ncols + 1, count % ncols + 1)

    return(iplot(fig))

run_grid(MLPRegressor, layer1_sizes, layer2_sizes, x, y)
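# In[ ]:

# run_grid scores each fit on the training data only. As a quick sanity check
# (a sketch, not part of the grid above), hold out a test set for one
# configuration and compare train vs. test MSE:
x_tr, x_te, y_tr, y_te = train_test_split(x.reshape(-1, 1), y, test_size=0.25, random_state=0)
mlp_check = MLPRegressor(hidden_layer_sizes=(4, 4), activation='tanh',
                         max_iter=10000, tol=1e-10, solver='lbfgs')
mlp_check.fit(x_tr, y_tr)
print("train MSE", mean_squared_error(y_tr, mlp_check.predict(x_tr)))
print("test MSE ", mean_squared_error(y_te, mlp_check.predict(x_te)))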
# In[9]:

# sklearn MLP regression didn't work as well as expected
# let's build our own keras model by wrapping it in the sklearn interface
def build_ols_model(input_size=1, hidden_layer_sizes=[4]):
    main_input = Input(shape=(input_size,), name='main_input')
    lastlayer = main_input
    for layer_size in hidden_layer_sizes:
        lastlayer = Dense(layer_size,
                          kernel_initializer=keras.initializers.glorot_normal(seed=None),
                          bias_initializer=keras.initializers.glorot_normal(seed=None),
                          activation='tanh')(lastlayer)
    output = Dense(1, activation='linear')(lastlayer)
    model = Model(inputs=[main_input], outputs=[output])
    model.compile(loss="mse", optimizer='rmsprop')
    model.summary()
    return model


# In[10]:

EPOCHS = 5000
BATCH_SIZE = 1000

def run_experiment(model, x, y):
    # snapshot the weights after each epoch so we can restore the best epoch
    # (a shallow copy of the model object would not snapshot the weights)
    weights = []
    losses = []
    for epoch in range(EPOCHS):
        fit = model.fit(x, y,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        epochs=1,
                        verbose=0)
        train_loss = fit.history['loss'][-1]
        losses.append(train_loss)
        weights.append(model.get_weights())
        bestloss_index = np.argmin(losses)
        bestloss_value = losses[bestloss_index]
        if epoch % 100 == 99:
            print("%s Epoch %d of %d Loss %.6f Best Loss %.6f" % (time.strftime("%H:%M:%S"),
                                                                  epoch+1, EPOCHS,
                                                                  train_loss, bestloss_value))
        # stop if loss rises by 20% from best
        #if train_loss / bestloss_value > 1.2:
        #    print("%s Stopping..." % (time.strftime("%H:%M:%S")))
        #    break

    print("%s Best training loss epoch %d, value %f" % (time.strftime("%H:%M:%S"),
                                                        bestloss_index, bestloss_value))
    model.set_weights(weights[bestloss_index])
    train_score = model.evaluate(x, y)
    print(train_score)
    print("%s Train MSE: %s" % (time.strftime("%H:%M:%S"), str(train_score)))
    print("%s Train R-squared: %.6f" % (time.strftime("%H:%M:%S"), 1 - train_score/y.var()))
    return mychart(x, y, model.predict(x))


# In[11]:

model = build_ols_model(hidden_layer_sizes=[4, 4])
run_experiment(model, x, y)


# In[12]:

# wrap our keras model in the sklearn wrapper so we can run a grid like the MLPRegressor above
from keras.wrappers.scikit_learn import KerasRegressor

# closure for sklearn wrapper
def make_build(layer_sizes):
    def myclosure():
        return build_ols_model(input_size=1, hidden_layer_sizes=layer_sizes)
    return myclosure

def make_keras_model(**kwargs):
    # make a function that takes a hidden_layer_sizes kwarg and returns an estimator of the right layer sizes
    build_fn = make_build(kwargs['hidden_layer_sizes'])
    keras_estimator = KerasRegressor(build_fn=build_fn, epochs=500, batch_size=32, verbose=0)
    return keras_estimator


# In[13]:

run_grid(make_keras_model, layer1_sizes, layer2_sizes, x, y)
# works but not as consistently as expected
# expected all to work well but show increasing degrees of freedom
# takeaway: feedforward NN a bit hinky for regression
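# In[ ]:

# Since the Keras model is now wrapped as a scikit-learn estimator, it can also be
# cross-validated directly (a sketch; slow, because each fold retrains for 500 epochs):
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(make_keras_model(hidden_layer_sizes=(4, 4)),
                            x.reshape(-1, 1), y,
                            scoring='neg_mean_squared_error', cv=3)
print("CV MSE: %.6f" % -cv_scores.mean())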
# In[14]:

# try RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state=42)

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

# First 2 are most important
# Number of trees in random forest
n_estimators = [int(a) for a in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(a) for a in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
print("Search grid:")
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)


# In[15]:

np.linspace(start=200, stop=2000, num=10)


# In[16]:

x = x.reshape(x.shape[0], 1)
y = y.reshape(y.shape[0], 1)


# In[17]:

rf = RandomForestRegressor()
rf.fit(x, y)
z = rf.predict(x)
mychart(x, y, z)


# In[18]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=1)
# Fit the random search model
rf_random.fit(x, y)


# In[20]:

rf_random.best_params_


# In[23]:

rf = RandomForestRegressor(bootstrap=False,
                           n_estimators=1200,
                           max_features='sqrt',
                           max_depth=10,
                           min_samples_leaf=2,
                           min_samples_split=5,
                           )
rf.fit(x, y)
z = rf.predict(x)
mychart(x, y, z)


# In[21]:

rf = RandomForestRegressor(n_estimators=600,
                           max_features='auto',
                           max_depth=None,
                           min_samples_leaf=4,
                           min_samples_split=2,
                           )
rf.fit(x, y)
z = rf.predict(x)
mychart(x, y, z)


# In[24]:

from scipy.interpolate import UnivariateSpline

spl = UnivariateSpline(x, y)
z = spl(x)
mychart(x, y, z)


# In[28]:

spl.set_smoothing_factor(101.0)
#print(spl.get_knots())
z = spl(x)
mychart(x, y, z)


# In[26]:

def spline_cv_test(smoothing_factor):
    kf = KFold(n_splits=5)
    errors = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        spline = UnivariateSpline(x_train, y_train)
        spline.set_smoothing_factor(smoothing_factor)
        y_pred_test = spline(x_test)
        errors.append(mean_squared_error(y_test, y_pred_test))
    return np.mean(np.array(errors))

print(spline_cv_test(49.856))

for sf in np.linspace(10, 110, num=101):
    print(sf, spline_cv_test(sf))


# In[29]:

from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st

xgbreg = XGBRegressor(n_jobs=1)

# (defined but not used below)
one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 50)

# (unused alternative search space)
param_grid = {
    'silent': [False],
    'max_depth': [6, 10, 15, 20],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100]}

params = {
    "n_estimators": [10, 20, 40, 80, 160, 320, 640],
    "max_depth": [4, 8, 16, 32, 64, 128],
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50],
    "colsample_bytree": [1.0, 0.99, 0.95, 0.90, 0.85, 0.80, 0.60],
    "subsample": [1.0, 0.99, 0.95, 0.90, 0.85, 0.80, 0.60],
    "gamma": [0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
    'reg_alpha': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    "min_child_weight": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}

gs = RandomizedSearchCV(xgbreg, params, n_jobs=1)
gs.fit(x, y)
gs.best_params_
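# In[ ]:

# Before re-fitting by hand below, the search result itself can be inspected
# (a sketch; best_score_ is the mean cross-validated R-squared that
# RandomizedSearchCV uses by default for regressors):
print(gs.best_score_)
z_best = gs.best_estimator_.predict(x)
print("in-sample MSE of best estimator: %.6f" % mean_squared_error(y, z_best))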
# In[30]:

xgbreg = XGBRegressor(n_jobs=1,
                      colsample_bytree=0.6,
                      gamma=3.0,
                      learning_rate=0.05,
                      max_depth=64,
                      min_child_weight=90,
                      n_estimators=10,
                      reg_alpha=100,
                      subsample=0.99)
xgbreg.fit(x, y)
z = xgbreg.predict(x)
mychart(x, y, z)


# In[31]:

# LSTM/GRU

def generator_factory(data,             # Xs
                      lookback,         # sequence length
                      min_index=0,      # start index
                      max_index=None,   # max index for e.g. train/xval/test; if None use all of X
                      shuffle=False,    # if True, just repeatedly sample randomly
                      batch_size=128):  # how many (predictors, responses) tuples to generate
    if max_index is None:
        max_index = data.shape[0] - 2      # last row of predictors that has a y to predict
    first_row = min_index + lookback       # first row of responses to return
    i = first_row
    while True:                            # keep generating forever
        if shuffle:
            # pick rows at random
            rows = np.random.choice(range(first_row, max_index), size=batch_size, replace=False)
        else:
            if i + batch_size >= max_index:
                # start from beginning
                i = first_row
            rows = np.arange(i, i + batch_size)
            i += len(rows)                 # advance so the next sequential batch is new
        predictors = np.zeros((len(rows), lookback, data.shape[-1]))
        responses = np.zeros((len(rows), data.shape[-1]))
        for j, row in enumerate(rows):
            indices = range(rows[j] - lookback, rows[j])
            predictors[j] = data[indices]
            responses[j] = data[rows[j]]
        yield predictors, [responses[:,0], responses[:,1]]


# In[32]:

y2 = y2.reshape(y2.shape[0], 1)
y1 = y1.reshape(y1.shape[0], 1)
X = np.hstack((y1, y2))
X.shape

mygen = generator_factory(X, 10, batch_size=16)
next(mygen)


# In[33]:

OUTPUT_DIM = 2

def build_model(hidden_layers=[[16, 0.0001, 0.2],
                               [16, 0.0001, 0.2],
                               [1, 0.0, 0.0]]):
    """Take list of [layer_size, reg_penalty, dropout]; last layer is linear, rest GRU"""
    main_input = Input(shape=(None, OUTPUT_DIM), dtype='float32', name='main_input')
    lastlayer = main_input
    n_lstms = len(hidden_layers) - 1
    for i in range(n_lstms):
        layer_size, reg_penalty, dropout = hidden_layers[i]
        # first n-1 layers are recurrent (GRU); all but the last return full sequences
        return_sequences = True
        if i == n_lstms - 1:
            return_sequences = False
        LSTM_layer = GRU(layer_size,
                         return_sequences=return_sequences,
                         kernel_regularizer=keras.regularizers.l1(reg_penalty),
                         dropout=dropout,
                         recurrent_dropout=dropout,
                         name="GRU%02d" % i)(lastlayer)
        lastlayer = LSTM_layer
    layer_size, reg_penalty, dropout = hidden_layers[-1]
    if dropout:
        lastlayer = Dropout(dropout)(lastlayer)
    outputs = []
    for i in range(OUTPUT_DIM):
        # OUTPUT_DIM outputs
        output01 = Dense(1, activation='linear',
                         kernel_regularizer=keras.regularizers.l1(reg_penalty),
                         name='output%02d' % i)(lastlayer)
        outputs.append(output01)
    model = Model(inputs=[main_input], outputs=outputs)
    model.summary()
    model.compile(loss="mae",
                  metrics=['mae'],
                  optimizer="rmsprop",
                  loss_weights=[1.]*OUTPUT_DIM)
    return model


# In[34]:

model = build_model()


# In[35]:

LOOKBACK = 128
BATCH_SIZE = 64
mygen = generator_factory(X, LOOKBACK, batch_size=BATCH_SIZE, shuffle=True)
history = model.fit_generator(mygen,
                              steps_per_epoch=32,
                              epochs=40,
                              )


# In[36]:

model.predict(X[:128].reshape(1, LOOKBACK, OUTPUT_DIM))


# In[37]:

# the actual next observation (row 128) that the window above predicts
X[128]


# In[38]:

# predict using X
X_pred = np.zeros_like(X)
# copy first 128 rows to X_pred
for i in range(LOOKBACK):
    X_pred[i] = X[i]
# forecast remainder of rows using X
for i in range(LOOKBACK, X.shape[0]):
    predlist = model.predict(X[i-LOOKBACK:i].reshape(1, LOOKBACK, OUTPUT_DIM))
    X_pred[i] = np.array([predlist[0][0][0], predlist[1][0][0]])


# In[39]:

from sklearn.metrics import mean_absolute_error
print("%.6f" % mean_absolute_error(y1, X_pred[:,0]))
print("%.6f" % mean_absolute_error(y2, X_pred[:,1]))


# In[40]:

mychart(x2, X_pred[:,0], X_pred[:,1])
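# In[ ]:

# For context, compare the one-step-ahead errors above against a naive persistence
# baseline over the forecast region, which predicts each value as the previous
# observation (a sketch, not part of the original experiment):
print("%.6f" % mean_absolute_error(y1[LOOKBACK:], y1[LOOKBACK-1:-1]))
print("%.6f" % mean_absolute_error(y2[LOOKBACK:], y2[LOOKBACK-1:-1]))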
# In[41]:

# predict recursively after bootstrapping first 128 rows of X
X_pred = np.zeros_like(X)
# copy first 128 rows to X_pred
for i in range(LOOKBACK):
    X_pred[i] = X[i]
# recursively forecast remainder of rows using X_pred
for i in range(LOOKBACK, X.shape[0]):
    predlist = model.predict(X_pred[i-LOOKBACK:i].reshape(1, LOOKBACK, OUTPUT_DIM))
    X_pred[i] = np.array([predlist[0][0][0], predlist[1][0][0]])


# In[42]:

mychart(x2, X_pred[:,0], X_pred[:,1])
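# In[ ]:

# Mirror the earlier error check for the recursive forecast (a sketch): errors are
# expected to be larger here because each prediction feeds the next input window.
print("%.6f" % mean_absolute_error(y1, X_pred[:,0]))
print("%.6f" % mean_absolute_error(y2, X_pred[:,1]))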