First, we set up HaskellR.
:ext QuasiQuotes
import qualified H.Prelude as H
H.initialize H.defaultConfig
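As a quick check that the bridge is up, we can evaluate an R expression and splice a Haskell value into it; the _hs suffix is how HaskellR makes a Haskell binding visible to quasiquoted R code. (A minimal sketch; xs is a throwaway value used only here.)
let xs = [1.0, 2.0, 3.0] :: [Double]
[rprint| mean(xs_hs) |]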
Then, we set up R, loading all the packages we need.
[r|
library(keras)
library(dplyr)
library(ggplot2)
library(lubridate)
library(tidyr)
library(zoo)
library(forecast)
library(xts) |]
Let's assume our data is available as part of a larger trading application, written in Haskell.
Here we just load stock market data from files.
getAsDouble :: String -> [Double]
getAsDouble = map read . lines
googl <- fmap getAsDouble (readFile "googl.csv")
intl <- fmap getAsDouble (readFile "intl.csv")
nvda <- fmap getAsDouble (readFile "nvda.csv")
dates <- fmap lines (readFile "dates.csv")
Now we'd like to visualize our data (preferably as time series) and do some exploration.
How? Well, we do what we always do: use ggplot2...
The data shown are stock returns (= relative price differences). We compare returns for Google, Intel and Nvidia starting from 2017-01-01.
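The CSV files are assumed to hold these returns already; if we only had raw closing prices, a small helper on the Haskell side could derive them (a sketch; toReturns is a name introduced here purely for illustration).
-- r_t = (p_t - p_(t-1)) / p_(t-1)
toReturns :: [Double] -> [Double]
toReturns prices = zipWith (\prev cur -> (cur - prev) / prev) prices (tail prices)
Back to the plot: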
[rgraph|
df <- data.frame(tstamp = ymd(dates_hs), googl = googl_hs, intl = intl_hs, nvda = nvda_hs)
tss <<- read.zoo(df)
autoplot(tss) + facet_free() |]
We can quickly look at (auto-) correlations...
[rgraph|
acf(tss, na.action = na.pass) |]
And sure, we can quickly do an auto.arima on one of the series!
[rgraph|
intl_ts <- as.xts(tss[,2])
fit <- auto.arima(intl_ts)
fc <- forecast(fit, h=7)
plot(fc)|]
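The fitted object above is local to its quasiquote, so if we also want to see which specification auto.arima settled on, we can refit and print it (a quick sketch):
[rprint|
fit <- auto.arima(as.xts(tss[,2]))
fit |]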
The keras package needs a TensorFlow installation (accessed via reticulate), so let's check whether R can find it!
[rprint| reticulate::py_config() |]
First, we prepare our time series so it's in the correct shape for an LSTM with 7 timesteps...
[rprint|
lstm_num_timesteps <<- 7
intl <<- unclass(tss[,2])
# difference
intl_start <- intl[1]
intl_diff <- diff(intl)
# normalize
minval <<- min(intl_diff)
maxval <<- max(intl_diff)
normalize <- function(vec, min, max) {
  (vec - min) / (max - min)
}
denormalize <<- function(vec, min, max) {
  vec * (max - min) + min
}
intl_diff <- normalize(intl_diff, minval, maxval)
# create timesteps
X_train <<- t(sapply(1:(length(intl_diff) - lstm_num_timesteps), function(x) intl_diff[x:(x + lstm_num_timesteps - 1)]))
y_train <<- sapply((lstm_num_timesteps + 1):(length(intl_diff)), function(x) intl_diff[x])
# Keras LSTMs expect the input array to be shaped as (no. samples, no. time steps, no. features)
dim(X_train) <<- c(dim(X_train)[1], dim(X_train)[2], 1)
num_samples <- dim(X_train)[1]
num_steps <<- dim(X_train)[2]
num_features <<- dim(X_train)[3]
c(num_samples, num_steps, num_features) |]
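To make the windowing concrete, here is the same construction on a toy vector (toy, n_steps and the values are made up, purely for illustration): with 3 timesteps, row i of the input matrix is the window starting at position i, and the corresponding target is the element right after that window.
[rprint|
toy <- 1:10
n_steps <- 3
X_toy <- t(sapply(1:(length(toy) - n_steps), function(i) toy[i:(i + n_steps - 1)]))
y_toy <- toy[(n_steps + 1):length(toy)]
cbind(X_toy, target = y_toy) |]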
Now, we create the model!
-- at this point, there is no model yet
[rprint| model |]
[rprint|
batch_size <<- 1
epochs <<- 20
lstm_units <<- 4
model <<- keras_model_sequential()
|]
[rprint|
model %>%
  layer_lstm(units = lstm_units, input_shape = c(num_steps, num_features)) %>%
  layer_dense(units = 1) %>%
  compile(
    loss = 'mean_squared_error',
    optimizer = 'adam'
  )
model %>% summary()
|]
... and we train it!
[rprint|
model %>% fit(X_train, y_train, batch_size = batch_size, epochs = epochs)
# model %>% save_model_hdf5(filepath = paste0(model_name, ".h5"))
|]
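If we wanted to keep an eye on generalisation during training, fit() in the R keras package also accepts a validation_split fraction (Keras holds out the last part of the training samples); a sketch of the variant we could have used instead of the call above:
[rprint|
history <- model %>% fit(X_train, y_train,
                         batch_size = batch_size, epochs = epochs,
                         validation_split = 0.1, shuffle = FALSE)
history$metrics |]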
Let's see how well the model predicts the data (of course, in reality we'd evaluate on a held-out test series)!
[rgraph|
pred_train <- model %>% predict(X_train, batch_size = 1)
pred_train <- denormalize(pred_train, minval, maxval)
pred_train_undiff <- pred_train + intl[(lstm_num_timesteps+1):(length(intl)-1)]
# predictions cover positions (lstm_num_timesteps + 2) .. length(intl); earlier positions get NA
df <- data_frame(time_id = seq_along(intl),
                 train = intl,
                 pred_train = c(rep(NA, lstm_num_timesteps + 1), pred_train_undiff))
df <- df %>% gather(key = 'type', value = 'value', train:pred_train)
ggplot(df, aes(x = time_id, y = value)) + geom_line(aes(color = type)) + theme(aspect.ratio=0.8)
|]
Not bad for such a short training time, is it? :-)
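If we want to put a rough number on that, we can compute the training RMSE over the overlapping range; pred_train in the plotting block above is local to its quasiquote, so we recompute the predictions here (a sketch).
[rprint|
pred_train <- model %>% predict(X_train, batch_size = 1)
pred_train <- denormalize(pred_train, minval, maxval)
pred_train_undiff <- pred_train + intl[(lstm_num_timesteps + 1):(length(intl) - 1)]
actual <- intl[(lstm_num_timesteps + 2):length(intl)]
# root mean squared error of the reconstructed (undifferenced) predictions
sqrt(mean((actual - pred_train_undiff)^2)) |]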