import numpy as np
import pods

data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']

offset = y.mean()
scale = np.sqrt(y.var())

import matplotlib.pyplot as plt
import teaching_plots as plot
import mlai

xlim = (1875, 2030)
ylim = (2.5, 6.5)
yhat = (y - offset)/scale

fig, ax = plt.subplots(figsize=plot.big_wide_figsize)
_ = ax.plot(x, y, 'r.', markersize=10)
ax.set_xlabel('year', fontsize=20)
ax.set_ylabel('pace min/km', fontsize=20)
ax.set_xlim(xlim)
ax.set_ylim(ylim)

mlai.write_figure(figure=fig,
                  filename='../slides/diagrams/datasets/olympic-marathon.svg',
                  transparent=True,
                  frameon=True)

from ipywidgets import IntSlider
pods.notebook.display_plots('over_determined_system{samp:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            samp=IntSlider(1, 1, 7, 1))

import pods
pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA2')
pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA4')
pods.notebook.display_google_book(id='spcAAAAAMAAJ', page='PA72')

import pods
data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']
print(x)
print(y)

m = -0.4
c = 80

# set c to the value that minimises the error for the current m
c = (y - m*x).mean()
print(c)

# set m to the value that minimises the error for the current c
m = ((y - c)*x).sum()/(x**2).sum()
print(m)

import numpy as np
x_test = np.linspace(1890, 2020, 130)[:, None]
f_test = m*x_test + c

# alternate the two updates (coordinate descent) and watch the iterates settle down
for i in np.arange(10):
    m = ((y - c)*x).sum()/(x*x).sum()
    c = (y - m*x).sum()/y.shape[0]
    print(m)
    print(c)

f_test = m*x_test + c
plt.plot(x_test, f_test, 'b-')
plt.plot(x, y, 'rx')

# Write code for your answer to Question 3 in this box
# provide the answers so that the code runs correctly, otherwise you will lose marks!

import numpy as np
import matplotlib.pyplot as plt
import mlai

x = np.random.normal(size=4)
m_true = 1.4
c_true = -3.1
y = m_true*x + c_true

noise = np.random.normal(scale=0.5, size=4)  # standard deviation of the noise is 0.5
y = m_true*x + c_true + noise
plt.plot(x, y, 'r.', markersize=10)
plt.xlim([-3, 3])
mlai.write_figure(filename="../slides/diagrams/ml/regression_noise.svg", transparent=True)

# create an array of linearly separated values around m_true
m_vals = np.linspace(m_true-3, m_true+3, 100)
# create an array of linearly separated values around c_true
c_vals = np.linspace(c_true-3, c_true+3, 100)
m_grid, c_grid = np.meshgrid(m_vals, c_vals)
E_grid = np.zeros((100, 100))
for i in range(100):
    for j in range(100):
        E_grid[i, j] = ((y - m_grid[i, j]*x - c_grid[i, j])**2).sum()

%load -s regression_contour teaching_plots.py

m_star = 0.0
c_star = -5.0

$$\frac{\text{d}\errorFunction(m, c)}{\text{d} c} = -2\sum_{i=1}^\numData (\dataScalar_i - m\inputScalar_i - c)$$

c_grad = -2*(y - m_star*x - c_star).sum()
print("Gradient with respect to c is ", c_grad)

m_grad = -2*(x*(y - m_star*x - c_star)).sum()
print("Gradient with respect to m is ", m_grad)

print("Original m was", m_star, "and original c was", c_star)
learn_rate = 0.01
c_star = c_star - learn_rate*c_grad
m_star = m_star - learn_rate*m_grad
print("New m is", m_star, "and new c is", c_star)

num_plots = plot.regression_contour_fit(x, y, diagrams='../slides/diagrams/ml')

import pods
from ipywidgets import IntSlider
pods.notebook.display_plots('regression_contour_fit{num:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            num=IntSlider(0, 0, num_plots, 1))

The gradient with respect to $m$ is
$$\frac{\text{d}\errorFunction(m, c)}{\text{d} m} = -2\sum_{i=1}^\numData \inputScalar_i(\dataScalar_i - m\inputScalar_i - c)$$
but it has $\numData$ terms in the sum.
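Before deriving the stochastic version, it helps to see the cost of the batch update in code. The loop below is an illustrative sketch of my own (it reuses the x, y, m_star, c_star and learn_rate defined in the cells above, and the iteration count is an arbitrary choice): each pass recomputes both gradients as sums over all $\numData$ data points before making a single update to $m$ and $c$.

# batch gradient descent: every iteration sums over the full data set
num_iters = 100  # arbitrary illustrative choice
for it in range(num_iters):
    c_grad = -2*(y - m_star*x - c_star).sum()      # sum over all data points
    m_grad = -2*(x*(y - m_star*x - c_star)).sum()  # sum over all data points
    c_star = c_star - learn_rate*c_grad
    m_star = m_star - learn_rate*m_grad
print("After", num_iters, "batch updates: m is", m_star, "and c is", c_star)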
Substituting in the gradient we can see that the full update is of the form
$$m_\text{new} \leftarrow m_\text{old} + 2\learnRate \left[\inputScalar_1 (\dataScalar_1 - m_\text{old}\inputScalar_1 - c_\text{old}) + \inputScalar_2 (\dataScalar_2 - m_\text{old}\inputScalar_2 - c_\text{old}) + \dots + \inputScalar_\numData (\dataScalar_\numData - m_\text{old}\inputScalar_\numData - c_\text{old})\right]$$
This could be split up into lots of individual updates
$$m_1 \leftarrow m_\text{old} + 2\learnRate \left[\inputScalar_1 (\dataScalar_1 - m_\text{old}\inputScalar_1 - c_\text{old})\right]$$
$$m_2 \leftarrow m_1 + 2\learnRate \left[\inputScalar_2 (\dataScalar_2 - m_\text{old}\inputScalar_2 - c_\text{old})\right]$$
$$m_3 \leftarrow m_2 + 2\learnRate \left[\dots\right]$$
$$m_\numData \leftarrow m_{\numData-1} + 2\learnRate \left[\inputScalar_\numData (\dataScalar_\numData - m_\text{old}\inputScalar_\numData - c_\text{old})\right]$$

# choose a random point for the update (randint's upper bound is exclusive, so every point can be chosen)
i = np.random.randint(x.shape[0])
# update m
m_star = m_star + 2*learn_rate*(x[i]*(y[i] - m_star*x[i] - c_star))
# update c
c_star = c_star + 2*learn_rate*(y[i] - m_star*x[i] - c_star)

import pods
from ipywidgets import IntSlider
pods.notebook.display_plots('regression_sgd_contour_fit{num:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            num=IntSlider(0, 0, num_plots, 1))

import numpy as np
# define the vector w; the offset c comes first to match the column of ones in X below
w = np.zeros(shape=(2, 1))
w[0] = c
w[1] = m

import numpy as np
# design matrix: a column of ones for the offset followed by the inputs
# (x is assumed to be a column vector of inputs, shape (n, 1))
X = np.hstack((np.ones_like(x), x))
print(X)

import numpy as np
f = np.dot(X, w)  # np.dot does matrix multiplication in python
resid = (y - f)
E = np.dot(resid.T, resid)  # matrix multiplication on a single vector is equivalent to a dot product
print("Error function is:", E)

import numpy as np
np.linalg.solve?

w = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
print(w)

import pods
data = pods.datasets.movie_body_count()
movies = data['Y']
print(', '.join(movies.columns))

select_features = ['Year', 'Body_Count', 'Length_Minutes']
X = movies[select_features].copy()
X['Eins'] = 1  # add a column for the offset
y = movies[['IMDB_Rating']]

import pandas as pd
w = pd.DataFrame(data=np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y)),  # solve the normal equations
                 index=X.columns,  # the columns of X become the rows of w
                 columns=['regression_coefficient'])  # each row holds that feature's regression coefficient

(y - np.dot(X, w)).hist()

w

from IPython.lib.display import YouTubeVideo
YouTubeVideo('ui-uNlFHoms')

YouTubeVideo('78YNphT90-k')

import scipy as sp
import scipy.linalg  # make sure the linalg submodule is available as sp.linalg
Q, R = np.linalg.qr(X)
w = sp.linalg.solve_triangular(R, np.dot(Q.T, y))
w = pd.DataFrame(w, index=X.columns)
w
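As a quick check (this cell is my own addition, reusing the X and y of the movie example above), the QR route should agree with the normal-equations solution up to floating point error; the reason to prefer it is that it never forms the product np.dot(X.T, X) explicitly, whose condition number is the square of that of X.

import numpy as np
import scipy.linalg

# solve via the normal equations
w_normal = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))

# solve via the QR decomposition
Q, R = np.linalg.qr(X)
w_qr = scipy.linalg.solve_triangular(R, np.dot(Q.T, y))

# the two solutions should agree to numerical precision
print(np.allclose(w_normal, w_qr))

# conditioning: cond(X'X) is the square of cond(X), which is what QR avoids
print(np.linalg.cond(np.dot(X.T, X)), np.linalg.cond(X)**2)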