!pip3 install plotly
Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (4.5.0) Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from plotly) (1.14.0) Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly) (1.3.3)
import re
from pathlib import Path
from typing import Union, List
from plotly import express as px
from plotly import graph_objects as go
# Ensure that we have a `data` directory we use to store downloaded data.
# The directory is created from Python rather than via the `!mkdir -p` shell
# magic so this cell also runs outside IPython and without a POSIX shell.
data_dir: Path = Path('data')
# `parents=True, exist_ok=True` mirrors `mkdir -p`: no error if it already exists
data_dir.mkdir(parents=True, exist_ok=True)
# Downloading the "Auto Insurance in Sweden" data set
!wget -nc -P data https://www.math.muni.cz/~kolacek/docs/frvs/M7222/data/AutoInsurSweden.txt
File ‘data/AutoInsurSweden.txt’ already there; not retrieving.
!head -n 20 data/AutoInsurSweden.txt
Auto Insurance in Sweden In the following data X = number of claims Y = total payment for all the claims in thousands of Swedish Kronor for geographical zones in Sweden Reference: Swedish Committee on Analysis of Risk Premium in Motor Insurance http://college.hmco.com/mathematics/brase/understandable_statistics/7e/students/datasets/ slr/frames/frame.html X Y 108 392,5 19 46,2 13 15,7 124 422,2 40 119,4 57 170,9 23 56,9 14 77,5 45 214
# Location of the `AutoInsurSweden.txt` file inside the data directory
insurance_data_path: Path = data_dir / 'AutoInsurSweden.txt'

# Load the whole file, then pull out every tab-separated `x<TAB>y` number pair.
# The numbers use a decimal comma, so `,` is swapped for `.` before converting.
xs: List[float] = []
ys: List[float] = []
with open(insurance_data_path) as file:
    content: str = file.read()

for pair in re.finditer(r'([\d,]+)\t([\d,]+)', content):
    raw_x, raw_y = pair.groups()
    xs.append(float(raw_x.replace(',', '.')))
    ys.append(float(raw_y.replace(',', '.')))
# A convenience function which creates a scatter plot with an optional line
def plot(xs: List[float], ys: List[float], ys_pred: Union[List[float], None] = None) -> None:
    """Show a scatter plot of the data, optionally overlaid with predicted values.

    Args:
        xs: Values for the x axis (labelled "Number of claims").
        ys: Values for the y axis (labelled "Total payment"), one per entry in ``xs``.
        ys_pred: Optional predicted y values for ``xs``. When given and
            non-empty, they are added as an extra trace named "Guess".
    """
    fig = px.scatter(x=xs, y=ys, labels={'x': 'Number of claims', 'y': 'Total payment'})
    # Check for `None` and emptiness explicitly instead of relying on truthiness:
    # array-like inputs (e.g. numpy arrays) raise on a bare `if ys_pred:`.
    if ys_pred is not None and len(ys_pred) > 0:
        fig.add_trace(go.Scatter(x=xs, y=ys_pred, name='Guess'))
    fig.show()


plot(xs, ys)
# The linear function which describes a line
# Our goal is to find `m` and `b` such that the line most accurately "describes" the insurance data points
def predict(m: float, b: float, x: float) -> float:
    """Evaluate the line with slope ``m`` and y-intercept ``b`` at position ``x``."""
    slope_term = m * x
    return slope_term + b


assert predict(m=0, b=0, x=3) == 0
# SSE (sum of squared estimate of errors), the function we use to calculate how "wrong" we are
# "How much do the actual y values (`ys`) differ from our predicted y values (`ys_pred`)?"
def sum_squared_error(ys: List[float], ys_pred: List[float]) -> float:
    """Return the sum of squared differences between ``ys`` and ``ys_pred``.

    Args:
        ys: The actual y values.
        ys_pred: The predicted y values, one per entry in ``ys``.

    Returns:
        The sum over all pairs of ``(y - y_pred) ** 2``; ``0`` for empty input.

    Raises:
        ValueError: If ``ys`` and ``ys_pred`` differ in length.
    """
    # Raise explicitly instead of using `assert`: assertions are stripped under
    # `python -O`, after which `zip` would silently truncate to the shorter list.
    if len(ys) != len(ys_pred):
        raise ValueError(
            f'ys and ys_pred must have the same length, got {len(ys)} and {len(ys_pred)}'
        )
    return sum((y - y_pred) ** 2 for y, y_pred in zip(ys, ys_pred))


assert sum_squared_error([1, 2, 3], [4, 5, 6]) == 27
# Start from a flat line: slope `m` of 0 and intercept `b` of 200
m: float = 0
b: float = 200

# Evaluate the guessed line at every x value and measure how "off" it is via SSE
ys_pred: List[float] = [predict(m, b, claims) for claims in xs]
loss: float = sum_squared_error(ys, ys_pred)

# Show the data together with the guessed line
plot(xs, ys, ys_pred)

# Report the guess and its error
print(
    f'Initial guess for "m": {m}',
    f'Initial guess for "b": {b}',
    f'SSE: {loss}',
    sep='\n',
)
Initial guess for "m": 0 Initial guess for "b": 200 SSE: 1125865.2999999996
# Find the best fitting line through the data points via Gradient Descent
m: float = 0
b: float = 200

print(f'Starting with "m": {m}')
print(f'Starting with "b": {b}')

epochs: int = 10000
learning_rate: float = 0.00001

for epoch in range(epochs):
    # Calculate predictions for `y` values given the current `m` and `b`
    ys_pred: List[float] = [predict(m, b, x) for x in xs]

    # Calculate and print the error every 1000 epochs, starting with epoch 0.
    # The remainder is compared against 0 explicitly; comparing it to `True`
    # only matches a remainder of 1 and would log on epochs 1, 1001, ... instead.
    if epoch % 1000 == 0:
        loss: float = sum_squared_error(ys, ys_pred)
        print(f'Epoch {epoch} --> loss: {loss}')

    # Differences between prediction and actual value, shared by both gradients
    residuals: List[float] = [y_pred - y for y_pred, y in zip(ys_pred, ys)]

    # Calculate the gradient
    # Taking the (partial) derivative of SSE with respect to `m` results in `2 * x * ((m * x + b) - y)`
    grad_m: float = sum(2 * residual * x for residual, x in zip(residuals, xs))
    # Taking the (partial) derivative of SSE with respect to `b` results in `2 * ((m * x + b) - y)`
    grad_b: float = sum(2 * residual for residual in residuals)

    # Take a small step in the direction of greatest decrease
    m -= learning_rate * grad_m
    b -= learning_rate * grad_b

# Re-evaluate the predictions with the final `m` and `b`; the values computed
# inside the loop predate the last update step and would not match the
# parameters printed below.
ys_pred = [predict(m, b, x) for x in xs]

print(f'Best estimate for "m": {m}')
print(f'Best estimate for "b": {b}')

plot(xs, ys, ys_pred)
Starting with "m": 0 Starting with "b": 200 Epoch 1 --> loss: 1111304.0949169993 Epoch 1001 --> loss: 367095.40067246475 Epoch 2001 --> loss: 159429.33008833785 Epoch 3001 --> loss: 101348.4050032304 Epoch 4001 --> loss: 85104.08618286082 Epoch 5001 --> loss: 80560.80637884357 Epoch 6001 --> loss: 79290.12266438038 Epoch 7001 --> loss: 78934.73246790287 Epoch 8001 --> loss: 78835.33543438878 Epoch 9001 --> loss: 78807.53565158854 Best estimate for "m": 3.4071723383619705 Best estimate for "b": 20.302521479691976