#!/usr/bin/env python
# coding: utf-8

# # Linear Regression
# 
# This notebook implements linear regression using gradient descent as taught in Week 1 of [Coursera's Machine Learning course](https://www.coursera.org/learn/machine-learning/#syllabus).
# 
# The course doesn't explicitly go into the implementation yet, so I'm not sure how to pick a good alpha (learning rate), nor do I know a good way to determine how many iterations the algorithm should run (a convergence-based stopping criterion is sketched at the end of this notebook).
# 
# I decided to attempt a quick implementation in Python anyway in order to improve my understanding.

# In[1]:


import pandas as pd
import numpy as np


# In[2]:


df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [1, 3, 5, 7, 9, 11]})
df


# In[3]:


def hypothesis(theta0, theta1, x):
    return theta0 + theta1 * x

def linear_regression(df, alpha, iterations):
    # Batch gradient descent: each iteration accumulates the error terms
    # over all m samples, then updates both parameters.
    theta0 = 0
    theta1 = 0
    m = len(df)
    for _ in range(iterations):
        gradient0 = 0
        gradient1 = 0
        for i in range(m):
            error = hypothesis(theta0, theta1, df.x[i]) - df.y[i]
            gradient0 += error
            gradient1 += error * df.x[i]
        # Simultaneous update: both new values are computed from the old thetas.
        theta0 = theta0 - alpha / m * gradient0
        theta1 = theta1 - alpha / m * gradient1
    return [theta0, theta1]


# In[4]:


get_ipython().run_cell_magic('time', '', 'linear_regression(df, 0.1, 1000)\n')


# The implementation above can be made more efficient by replacing the inner loop with matrix operations.
# 
# Below is my attempt at doing so:

# In[5]:


def vector_linear_regression(df, alpha, iterations):
    m = len(df)
    thetas = np.zeros((2, 1))
    # Design matrix: a column of ones for the intercept, then the x values.
    X = pd.DataFrame(data={0: 1, 1: df.x}).to_numpy()  # Is there a more elegant way to build this matrix?
    y = np.array([df.y]).transpose()
    Xt = X.transpose()
    for _ in range(iterations):
        errors = X.dot(thetas) - y     # shape (m, 1)
        gradients = Xt.dot(errors)     # shape (2, 1)
        thetas = thetas - gradients * alpha / m
    return [thetas[0][0], thetas[1][0]]


# In[6]:


get_ipython().run_cell_magic('time', '', 'vector_linear_regression(df, 0.1, 1000)\n')


# In[7]:


df.to_numpy()
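
# Since the course hasn't yet covered how to choose alpha or the iteration
# count, below is a minimal sketch of one common approach (my own addition,
# not from the course): keep iterating until the improvement in the
# squared-error cost falls below a tolerance. The function name
# `linear_regression_with_tolerance` and the `tol` / `max_iterations`
# parameters are hypothetical names I picked for this sketch. It also uses
# `np.column_stack`, which may be a more elegant way to build the design
# matrix than going through a DataFrame.

# In[8]:


def linear_regression_with_tolerance(df, alpha, tol=1e-9, max_iterations=100000):
    # NOTE: this is a sketch; the name and parameters are my own choices.
    m = len(df)
    # Design matrix built directly in NumPy: a column of ones plus the x values.
    X = np.column_stack([np.ones(m), df.x.to_numpy()])
    y = df.y.to_numpy().reshape(-1, 1)
    thetas = np.zeros((2, 1))

    def cost(t):
        # Squared-error cost J(theta) = 1 / (2m) * sum(errors^2).
        errors = X.dot(t) - y
        return float((errors ** 2).sum()) / (2 * m)

    previous_cost = cost(thetas)
    for iteration in range(max_iterations):
        errors = X.dot(thetas) - y
        thetas = thetas - alpha / m * X.transpose().dot(errors)
        current_cost = cost(thetas)
        # Stop once the cost barely improves. A negative improvement means
        # the cost went up, i.e. alpha is too large and the descent diverges.
        if previous_cost - current_cost < tol:
            break
        previous_cost = current_cost
    return [thetas[0][0], thetas[1][0]], iteration + 1

linear_regression_with_tolerance(df, 0.1)


# Returning the iteration count alongside the thetas makes it easy to see how
# many steps a given alpha actually needed before the cost stopped improving.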
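
# As a sanity check (again my own addition), the gradient-descent results can
# be compared against NumPy's exact least-squares solver, `np.linalg.lstsq`:

# In[9]:


X = np.column_stack([np.ones(len(df)), df.x.to_numpy()])
thetas, *_ = np.linalg.lstsq(X, df.y.to_numpy(), rcond=None)
thetas  # expect approximately [1., 2.], since the data lie exactly on y = 1 + 2x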