import numpy as np
import pods

data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']

offset = y.mean()
scale = np.sqrt(y.var())

import matplotlib.pyplot as plt
import teaching_plots as plot
import mlai

xlim = (1875, 2030)
ylim = (2.5, 6.5)
yhat = (y - offset)/scale

fig, ax = plt.subplots(figsize=plot.big_wide_figsize)
_ = ax.plot(x, y, 'r.', markersize=10)
ax.set_xlabel('year', fontsize=20)
ax.set_ylabel('pace min/km', fontsize=20)
ax.set_xlim(xlim)
ax.set_ylim(ylim)

mlai.write_figure(figure=fig,
                  filename='../slides/diagrams/datasets/olympic-marathon.svg',
                  transparent=True,
                  frameon=True)

from ipywidgets import IntSlider
pods.notebook.display_plots('over_determined_system{samp:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            samp=IntSlider(1, 1, 7, 1))

import pods
pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA2')
pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA4')
pods.notebook.display_google_book(id='spcAAAAAMAAJ', page='PA72')

import pods
data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']
print(x)
print(y)

m = -0.4
c = 80

# set c to the value that minimises the error for the current m
c = (y - m*x).mean()
print(c)

# set m to the value that minimises the error for the current c
m = ((y - c)*x).sum()/(x**2).sum()
print(m)

import numpy as np
x_test = np.linspace(1890, 2020, 130)[:, None]
f_test = m*x_test + c

# alternate the two updates (coordinate descent) and watch the iterates settle down
for i in np.arange(10):
    m = ((y - c)*x).sum()/(x*x).sum()
    c = (y - m*x).sum()/y.shape[0]
    print(m)
    print(c)

f_test = m*x_test + c
plt.plot(x_test, f_test, 'b-')
plt.plot(x, y, 'rx')

# Write code for your answer to Question 3 in this box
# provide the answers so that the code runs correctly, otherwise you will lose marks!

import numpy as np
import matplotlib.pyplot as plt
import mlai

x = np.random.normal(size=4)
m_true = 1.4
c_true = -3.1
y = m_true*x + c_true

noise = np.random.normal(scale=0.5, size=4)  # standard deviation of the noise is 0.5
y = m_true*x + c_true + noise
plt.plot(x, y, 'r.', markersize=10)
plt.xlim([-3, 3])
mlai.write_figure(filename="../slides/diagrams/ml/regression_noise.svg", transparent=True)

# create an array of linearly separated values around m_true
m_vals = np.linspace(m_true-3, m_true+3, 100)
# create an array of linearly separated values around c_true
c_vals = np.linspace(c_true-3, c_true+3, 100)
m_grid, c_grid = np.meshgrid(m_vals, c_vals)
E_grid = np.zeros((100, 100))
for i in range(100):
    for j in range(100):
        E_grid[i, j] = ((y - m_grid[i, j]*x - c_grid[i, j])**2).sum()

%load -s regression_contour teaching_plots.py

m_star = 0.0
c_star = -5.0

$$\frac{\text{d}\errorFunction(m, c)}{\text{d} c} = -2\sum_{i=1}^\numData (\dataScalar_i - m\inputScalar_i - c)$$

c_grad = -2*(y - m_star*x - c_star).sum()
print("Gradient with respect to c is ", c_grad)

m_grad = -2*(x*(y - m_star*x - c_star)).sum()
print("Gradient with respect to m is ", m_grad)

print("Original m was", m_star, "and original c was", c_star)
learn_rate = 0.01
c_star = c_star - learn_rate*c_grad
m_star = m_star - learn_rate*m_grad
print("New m is", m_star, "and new c is", c_star)

num_plots = plot.regression_contour_fit(x, y, diagrams='../slides/diagrams/ml')

import pods
from ipywidgets import IntSlider
pods.notebook.display_plots('regression_contour_fit{num:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            num=IntSlider(0, 0, num_plots, 1))

The gradient with respect to $m$ is
$$\frac{\text{d}\errorFunction(m, c)}{\text{d} m} = -2\sum_{i=1}^\numData \inputScalar_i(\dataScalar_i - m\inputScalar_i - c)$$
but it has $\numData$ terms in the sum.
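Before deriving the stochastic version, it helps to see the cost of the batch update in code. The loop below is an illustrative sketch of my own (it reuses the x, y, m_star, c_star and learn_rate defined in the cells above, and the iteration count is an arbitrary choice): each pass recomputes both gradients as sums over all $\numData$ data points before making a single update to $m$ and $c$.

# batch gradient descent: every iteration sums over the full data set
num_iters = 100  # arbitrary illustrative choice
for it in range(num_iters):
    c_grad = -2*(y - m_star*x - c_star).sum()      # sum over all data points
    m_grad = -2*(x*(y - m_star*x - c_star)).sum()  # sum over all data points
    c_star = c_star - learn_rate*c_grad
    m_star = m_star - learn_rate*m_grad
print("After", num_iters, "batch updates: m is", m_star, "and c is", c_star)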
Substituting in the gradient we can see that the full update is of the form
$$m_\text{new} \leftarrow m_\text{old} + 2\learnRate \left[\inputScalar_1 (\dataScalar_1 - m_\text{old}\inputScalar_1 - c_\text{old}) + \inputScalar_2 (\dataScalar_2 - m_\text{old}\inputScalar_2 - c_\text{old}) + \dots + \inputScalar_\numData (\dataScalar_\numData - m_\text{old}\inputScalar_\numData - c_\text{old})\right]$$
This could be split up into lots of individual updates
$$m_1 \leftarrow m_\text{old} + 2\learnRate \left[\inputScalar_1 (\dataScalar_1 - m_\text{old}\inputScalar_1 - c_\text{old})\right]$$
$$m_2 \leftarrow m_1 + 2\learnRate \left[\inputScalar_2 (\dataScalar_2 - m_\text{old}\inputScalar_2 - c_\text{old})\right]$$
$$m_3 \leftarrow m_2 + 2\learnRate \left[\dots\right]$$
$$m_\numData \leftarrow m_{\numData-1} + 2\learnRate \left[\inputScalar_\numData (\dataScalar_\numData - m_\text{old}\inputScalar_\numData - c_\text{old})\right]$$

# choose a random point for the update (randint's upper bound is exclusive, so every point can be chosen)
i = np.random.randint(x.shape[0])
# update m
m_star = m_star + 2*learn_rate*(x[i]*(y[i] - m_star*x[i] - c_star))
# update c
c_star = c_star + 2*learn_rate*(y[i] - m_star*x[i] - c_star)

import pods
from ipywidgets import IntSlider
pods.notebook.display_plots('regression_sgd_contour_fit{num:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            num=IntSlider(0, 0, num_plots, 1))

import numpy as np
# define the vector w; the offset c comes first to match the column of ones in X below
w = np.zeros(shape=(2, 1))
w[0] = c
w[1] = m

import numpy as np
# design matrix: a column of ones for the offset followed by the inputs
# (x is assumed to be a column vector of inputs, shape (n, 1))
X = np.hstack((np.ones_like(x), x))
print(X)

import numpy as np
f = np.dot(X, w)  # np.dot does matrix multiplication in python
resid = (y - f)
E = np.dot(resid.T, resid)  # matrix multiplication on a single vector is equivalent to a dot product
print("Error function is:", E)

import numpy as np
np.linalg.solve?

w = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
print(w)

import pods
data = pods.datasets.movie_body_count()
movies = data['Y']
print(', '.join(movies.columns))

select_features = ['Year', 'Body_Count', 'Length_Minutes']
X = movies[select_features].copy()
X['Eins'] = 1  # add a column for the offset
y = movies[['IMDB_Rating']]

import pandas as pd
w = pd.DataFrame(data=np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y)),  # solve the normal equations
                 index=X.columns,  # the columns of X become the rows of w
                 columns=['regression_coefficient'])  # each row holds that feature's regression coefficient

(y - np.dot(X, w)).hist()

w

from IPython.lib.display import YouTubeVideo
YouTubeVideo('ui-uNlFHoms')

YouTubeVideo('78YNphT90-k')

import scipy as sp
import scipy.linalg  # make sure the linalg submodule is available as sp.linalg
Q, R = np.linalg.qr(X)
w = sp.linalg.solve_triangular(R, np.dot(Q.T, y))
w = pd.DataFrame(w, index=X.columns)
w
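As a quick check (this cell is my own addition, reusing the X and y of the movie example above), the QR route should agree with the normal-equations solution up to floating point error; the reason to prefer it is that it never forms the product np.dot(X.T, X) explicitly, whose condition number is the square of that of X.

import numpy as np
import scipy.linalg

# solve via the normal equations
w_normal = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))

# solve via the QR decomposition
Q, R = np.linalg.qr(X)
w_qr = scipy.linalg.solve_triangular(R, np.dot(Q.T, y))

# the two solutions should agree to numerical precision
print(np.allclose(w_normal, w_qr))

# conditioning: cond(X'X) is the square of cond(X), which is what QR avoids
print(np.linalg.cond(np.dot(X.T, X)), np.linalg.cond(X)**2)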