#!/usr/bin/env python
# coding: utf-8

# # Exercise 5.3: Neural Networks in Keras

# In[1]:


import numpy as np
import matplotlib.pyplot as plt

# See https://keras.io/
# for extensive documentation
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense


# Let us visit the problem of wine quality prediction previously encountered in Exercises 3.2 and 4.1 one final time. After linear regression and a self-made network, we can now explore the comfort provided by the Keras library.

# In[ ]:


# The code snippet below downloads the dataset to Google Colab.
# You can directly download the file using the link
# if you work with a local Anaconda setup.
get_ipython().system('wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')


# In[ ]:


# Load all examples from the file
data = np.genfromtxt('winequality-white.csv', delimiter=";", skip_header=1)

print("data:", data.shape)

# Prepare for proper training
np.random.shuffle(data)  # randomly sort examples

# Take the first 3000 examples for training
X_train = data[:3000, :11]  # all features except last column
y_train = data[:3000, 11]   # quality column

# and the remaining examples for testing
X_test = data[3000:, :11]  # all features except last column
y_test = data[3000:, 11]   # quality column

print("First example:")
print("Features:", X_train[0])
print("Quality:", y_train[0])


# Below is the simple network from Exercise 4.1 implemented using Keras. In addition to the network we define the loss function and optimiser.

# In[ ]:


# See: https://keras.io/api/models/sequential/ and
# https://keras.io/api/layers/core_layers/dense/

# We can use the Sequential class to very easily
# build a simple architecture
model = Sequential()
# 11 inputs, 20 outputs, ReLU activation
model.add(Dense(20, input_dim=11, activation='relu'))
# 20 inputs (automatically detected by Keras), 1 output, linear activation
model.add(Dense(1, activation='linear'))

# Set loss function and optimiser algorithm
model.compile(loss='mse',        # mean squared error
              optimizer='sgd')   # stochastic gradient descent


# # Training and evaluation
#
# The code below trains the network for 5 epochs using the loss function and optimiser defined above. With a batch size of 1, each example is individually passed to the network.

# In[ ]:


history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=5, batch_size=1)


# In[ ]:


# The history object returned by the model training above
# contains the values of the loss function (the mean squared error)
# at different epochs.
# We discard the first epoch as its loss value is very high,
# obscuring the rest of the distribution.
train_loss = history.history["loss"][1:]
test_loss = history.history["val_loss"][1:]


# In[ ]:


# Plot the loss over time
plt.plot(train_loss, label="train")
plt.plot(test_loss, label="test")
plt.legend()
plt.xlabel("Epoch-1")
plt.ylabel("Loss")
plt.show()


# In[ ]:


# After the training:
# prepare a scatter plot of predicted vs. true quality
y_pred = model.predict(X_test)[:, 0]
print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])
plt.scatter(y_pred, y_test)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


# In[ ]:


# The full 2x2 correlation matrix between predictions and true values
np.corrcoef(y_pred, y_test)
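# Besides the correlation coefficient, the test loss itself quantifies the performance. Below is a minimal sketch (not part of the original exercise), assuming the `model` trained above is still in scope.

# In[ ]:


# Evaluate the mean squared error on the test set
# (the loss that was set in model.compile)
test_mse = model.evaluate(X_test, y_test, verbose=0)
print("Test MSE:", test_mse)
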
# # Problems
#
# * Use the notebook as a starting point. It already contains the simple network from Exercise 4.1 implemented in Keras.
#
# * Currently, SGD is used without momentum. Try training with a momentum term, then replace SGD with the Adam optimizer and train using that (see https://keras.io/api/optimizers/ and the first sketch below).
#
# * Add two more hidden layers to the network (you can choose the number of nodes, but make sure to apply the ReLU activation function after each) and train again; see the second sketch below.
#
# * Test different numbers of examples to be simultaneously used by the network (i.e. change the batch size), as in the last sketch below.
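# One possible sketch for the optimizer problem. The momentum value of 0.9, the learning rate of 0.01, the default Adam settings, and the helper function `build_model` are illustrative choices, not prescribed by the exercise.

# In[ ]:


from keras.optimizers import SGD, Adam

def build_model():
    # Rebuild the simple network so each optimizer starts from scratch
    m = Sequential()
    m.add(Dense(20, input_dim=11, activation='relu'))
    m.add(Dense(1, activation='linear'))
    return m

# SGD with a momentum term: past gradients enter the update
# as an exponentially decaying running average, smoothing the steps
model_momentum = build_model()
model_momentum.compile(loss='mse',
                       optimizer=SGD(learning_rate=0.01, momentum=0.9))
history_momentum = model_momentum.fit(X_train, y_train,
                                      validation_data=(X_test, y_test),
                                      epochs=5, batch_size=1)

# Adam adapts the learning rate per parameter; the Keras defaults
# are usually a reasonable starting point
model_adam = build_model()
model_adam.compile(loss='mse', optimizer=Adam())
history_adam = model_adam.fit(X_train, y_train,
                              validation_data=(X_test, y_test),
                              epochs=5, batch_size=1)
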
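# A sketch for the deeper network: two additional hidden Dense layers, each with a ReLU activation. The width of 20 nodes per layer is an arbitrary choice (the exercise leaves it open), and Adam is used here as the optimizer.

# In[ ]:


model_deep = Sequential()
model_deep.add(Dense(20, input_dim=11, activation='relu'))
model_deep.add(Dense(20, activation='relu'))  # first additional hidden layer
model_deep.add(Dense(20, activation='relu'))  # second additional hidden layer
model_deep.add(Dense(1, activation='linear'))
model_deep.compile(loss='mse', optimizer='adam')
model_deep.summary()  # inspect the architecture and parameter count

history_deep = model_deep.fit(X_train, y_train,
                              validation_data=(X_test, y_test),
                              epochs=5, batch_size=1)
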
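# A sketch for the batch-size study: train an identical network for several batch sizes and compare the validation losses. The list of batch sizes is only an example; larger batches finish an epoch faster but take fewer gradient steps per epoch.

# In[ ]:


for batch_size in [1, 16, 64, 256]:
    m = Sequential()
    m.add(Dense(20, input_dim=11, activation='relu'))
    m.add(Dense(1, activation='linear'))
    m.compile(loss='mse', optimizer='adam')
    h = m.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=5, batch_size=batch_size, verbose=0)
    plt.plot(h.history["val_loss"], label="batch size %d" % batch_size)

plt.legend()
plt.xlabel("Epoch")
plt.ylabel("Validation loss")
plt.show()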