#!/usr/bin/env python
# coding: utf-8

# # Linear Regression
# 
# This notebook implements linear regression using gradient descent as taught in Week 1 of [Coursera's Machine Learning course](https://www.coursera.org/learn/machine-learning/#syllabus).
# 
# The course doesn't explicitly go into the implementation yet, so I'm not sure how to pick a good alpha (learning rate), nor do I know a good way to determine how many iterations the algorithm should run (a convergence-based stopping criterion is sketched at the end of this notebook).
# 
# I decided to attempt a quick implementation in Python anyway in order to improve my understanding.

# In[1]:


import pandas as pd
import numpy as np


# In[2]:


df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [1, 3, 5, 7, 9, 11]})
df


# In[3]:


def hypothesis(theta0, theta1, x):
    return theta0 + theta1 * x

def linear_regression(df, alpha, iterations):
    # Batch gradient descent: each iteration accumulates the error terms
    # over all m samples, then updates both parameters.
    theta0 = 0
    theta1 = 0
    m = len(df)
    for _ in range(iterations):
        gradient0 = 0
        gradient1 = 0
        for i in range(m):
            error = hypothesis(theta0, theta1, df.x[i]) - df.y[i]
            gradient0 += error
            gradient1 += error * df.x[i]
        # Simultaneous update: both new values are computed from the old thetas.
        theta0 = theta0 - alpha / m * gradient0
        theta1 = theta1 - alpha / m * gradient1
    return [theta0, theta1]


# In[4]:


get_ipython().run_cell_magic('time', '', 'linear_regression(df, 0.1, 1000)\n')


# The implementation above can be made more efficient by replacing the inner loop with matrix operations.
# 
# Below is my attempt at doing so:

# In[5]:


def vector_linear_regression(df, alpha, iterations):
    m = len(df)
    thetas = np.zeros((2, 1))
    # Design matrix: a column of ones for the intercept, then the x values.
    X = pd.DataFrame(data={0: 1, 1: df.x}).to_numpy()  # Is there a more elegant way to build this matrix?
    y = np.array([df.y]).transpose()
    Xt = X.transpose()
    for _ in range(iterations):
        errors = X.dot(thetas) - y     # shape (m, 1)
        gradients = Xt.dot(errors)     # shape (2, 1)
        thetas = thetas - gradients * alpha / m
    return [thetas[0][0], thetas[1][0]]


# In[6]:


get_ipython().run_cell_magic('time', '', 'vector_linear_regression(df, 0.1, 1000)\n')


# In[7]:


df.to_numpy()
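
# Since the course hasn't yet covered how to choose alpha or the iteration
# count, below is a minimal sketch of one common approach (my own addition,
# not from the course): keep iterating until the improvement in the
# squared-error cost falls below a tolerance. The function name
# `linear_regression_with_tolerance` and the `tol` / `max_iterations`
# parameters are hypothetical names I picked for this sketch. It also uses
# `np.column_stack`, which may be a more elegant way to build the design
# matrix than going through a DataFrame.

# In[8]:


def linear_regression_with_tolerance(df, alpha, tol=1e-9, max_iterations=100000):
    # NOTE: this is a sketch; the name and parameters are my own choices.
    m = len(df)
    # Design matrix built directly in NumPy: a column of ones plus the x values.
    X = np.column_stack([np.ones(m), df.x.to_numpy()])
    y = df.y.to_numpy().reshape(-1, 1)
    thetas = np.zeros((2, 1))

    def cost(t):
        # Squared-error cost J(theta) = 1 / (2m) * sum(errors^2).
        errors = X.dot(t) - y
        return float((errors ** 2).sum()) / (2 * m)

    previous_cost = cost(thetas)
    for iteration in range(max_iterations):
        errors = X.dot(thetas) - y
        thetas = thetas - alpha / m * X.transpose().dot(errors)
        current_cost = cost(thetas)
        # Stop once the cost barely improves. A negative improvement means
        # the cost went up, i.e. alpha is too large and the descent diverges.
        if previous_cost - current_cost < tol:
            break
        previous_cost = current_cost
    return [thetas[0][0], thetas[1][0]], iteration + 1

linear_regression_with_tolerance(df, 0.1)


# Returning the iteration count alongside the thetas makes it easy to see how
# many steps a given alpha actually needed before the cost stopped improving.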
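
# As a sanity check (again my own addition), the gradient-descent results can
# be compared against NumPy's exact least-squares solver, `np.linalg.lstsq`:

# In[9]:


X = np.column_stack([np.ones(len(df)), df.x.to_numpy()])
thetas, *_ = np.linalg.lstsq(X, df.y.to_numpy(), rcond=None)
thetas  # expect approximately [1., 2.], since the data lie exactly on y = 1 + 2x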