Linear Regression from scratch¶

In [1]:

!pip3 install plotly

Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (4.5.0)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from plotly) (1.14.0)
Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly) (1.3.3)

In [2]:

import re
from pathlib import Path
from typing import Union, List
from plotly import express as px
from plotly import graph_objects as go

In [3]:

# Ensure that we have a `data` directory we use to store downloaded data
!mkdir -p data
data_dir: Path = Path('data')

In [4]:

# Downloading the "Auto Insurance in Sweden" data set
!wget -nc -P data https://www.math.muni.cz/~kolacek/docs/frvs/M7222/data/AutoInsurSweden.txt

File ‘data/AutoInsurSweden.txt’ already there; not retrieving.

In [5]:

!head -n 20 data/AutoInsurSweden.txt

Auto Insurance in Sweden

In the following data
X = number of claims
Y = total payment for all the claims in thousands of Swedish Kronor
for geographical zones in Sweden
Reference: Swedish Committee on Analysis of Risk Premium in Motor Insurance
http://college.hmco.com/mathematics/brase/understandable_statistics/7e/students/datasets/
       slr/frames/frame.html

X	Y
108	392,5
19	46,2
13	15,7
124	422,2
40	119,4
57	170,9
23	56,9
14	77,5
45	214

In [6]:

# Create the Python path pointing to the `AutoInsurSweden.txt` file
insurance_data_path: Path = data_dir / 'AutoInsurSweden.txt'

In [7]:

# Read the `AutoInsurSweden.txt` file, extract the `x` and `y` values via regex and store them into vectors
xs: List[float] = []
ys: List[float] = []

with open(insurance_data_path) as file:
    content: str = file.read()
    for x, y in re.findall(r'([\d,]+)\t([\d,]+)', content):
        xs.append(float(x.replace(',', '.')))
        ys.append(float(y.replace(',', '.')))

In [8]:

# A convenience function which creates a scatter plot with an optional line
def plot(xs: List[float], ys: List[float], ys_pred: Union[List[float], None] = None) -> None:
    fig = px.scatter(x=xs, y=ys, labels={'x': 'Number of claims', 'y': 'Total payment'})
    # If present, add the line
    if ys_pred:
        fig.add_trace(
            go.Scatter(
                x=xs, y=ys_pred, name='Guess'
            )
        )
    fig.show()

In [9]:

plot(xs, ys)

In [10]:

# The linear function which describes a line
# Our goal is to find `m` and `b` such that the line most accurately "describes" the insurance data points
def predict(m: float, b: float, x: float) -> float:
    return m * x + b

assert predict(m=0, b=0, x=3) == 0

In [11]:

# SSE (sum of squared estimate of errors), the function we use to calculate how "wrong" we are
# "How much do the actual y values (`ys`) differ from our predicted y values (`ys_pred`)?"
def sum_squared_error(ys: List[float], ys_pred: List[float]) -> float:
    assert len(ys) == len(ys_pred)
    return sum([(y - ys_pred) ** 2 for y, ys_pred in zip(ys, ys_pred)])

assert sum_squared_error([1, 2, 3], [4, 5, 6]) == 27

In [12]:

# Our initial guess as to what `m` and `b` might be
m: float = 0
b: float = 200

# Predicting the y values based on our initial guess for the line
ys_pred: List[float] = [predict(m, b, x) for x in xs]

# Visualize the result
plot(xs, ys, ys_pred)

print(f'Initial guess for "m": {m}')
print(f'Initial guess for "b": {b}')

# Calculate how "off" we are via SSE
loss: float = sum_squared_error(ys, ys_pred)
print(f'SSE: {loss}')

Initial guess for "m": 0
Initial guess for "b": 200
SSE: 1125865.2999999996

In [13]:

# Find the best fitting line through the data points via Gradient Descent
m: float = 0
b: float = 200

print(f'Starting with "m": {m}')
print(f'Starting with "b": {b}')

epochs: int = 10000
learning_rate: float = 0.00001

for epoch in range(epochs):
    # Calculate predictions for `y` values given the current `m` and `b`
    ys_pred: List[float] = [predict(m, b, x) for x in xs]

    # Calculate and print the error
    if epoch % 1000 == True:
        loss: float = sum_squared_error(ys, ys_pred)
        print(f'Epoch {epoch} --> loss: {loss}')

    # Calculate the gradient
    # Taking the (partial) derivative of SSE with respect to `m` results in `2 * x ((m * x + b) - y)`
    grad_m: float = sum([2 * (predict(m, b, x) - y) * x for x, y in zip(xs, ys)])
    # Taking the (partial) derivative of SSE with respect to `b` results in `2 ((m * x + b) - y)`
    grad_b: float = sum([2 * (predict(m, b, x) - y) for x, y in zip(xs, ys)])
    
    # Take a small step in the direction of greatest decrease
    m = m + (grad_m * -learning_rate)
    b = b + (grad_b * -learning_rate)

print(f'Best estimate for "m": {m}')
print(f'Best estimate for "b": {b}')

plot(xs, ys, ys_pred)

Starting with "m": 0
Starting with "b": 200
Epoch 1 --> loss: 1111304.0949169993
Epoch 1001 --> loss: 367095.40067246475
Epoch 2001 --> loss: 159429.33008833785
Epoch 3001 --> loss: 101348.4050032304
Epoch 4001 --> loss: 85104.08618286082
Epoch 5001 --> loss: 80560.80637884357
Epoch 6001 --> loss: 79290.12266438038
Epoch 7001 --> loss: 78934.73246790287
Epoch 8001 --> loss: 78835.33543438878
Epoch 9001 --> loss: 78807.53565158854
Best estimate for "m": 3.4071723383619705
Best estimate for "b": 20.302521479691976