#!/usr/bin/env python
# coding: utf-8

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import matplotlib.pyplot as plt


def basic_linear_reg(x, y):
    """Ordinary least-squares fit of the line y = a*x + b.

    Parameters
    ----------
    x, y : equal-length sequences of numbers.

    Returns
    -------
    [a, b] : slope and intercept of the least-squares line.
    """
    length = len(x)
    sum_x = sum(x)
    sum_y = sum(y)
    sum_xsq = sum(v * v for v in x)
    sum_xy = sum(a * b for a, b in zip(x, y))
    # Closed-form OLS slope/intercept; reuse sum_x instead of
    # recomputing sum(x) a second time as the original did.
    a = (sum_xy - sum_x * sum_y / length) / (sum_xsq - sum_x ** 2 / length)
    b = (sum_y - a * sum_x) / length
    return [a, b]


# Noisy sample of log(x) on (0, 100).
x = np.random.uniform(0, 100, 1000)
y = np.log(x) + np.random.normal(0, 0.3, 1000)

plt.figure(1)
plt.scatter(x, y, label="log(x) with some noise")
plt.plot(np.arange(1, 100), np.log(np.arange(1, 100)),
         c="chartreuse", label="log(x) true function")
plt.xlabel("x")
plt.ylabel("f(x) = log(x)")
plt.legend(loc="best")
plt.title("A Basic Log Function")

# Linear fit and its residuals.
plt.figure(2)
reg = basic_linear_reg(x, y)
# Build the regression line directly as arrays; the original built a
# tuple via zip(*...) and later had to wrap it in np.array().
r_x = np.arange(100)
r_y = r_x * reg[0] + reg[1]
plt.plot(r_x, r_y, c='g', label="linear regression")
plt.scatter(x, y - (x * reg[0] + reg[1]), c='red', label="residuals")
plt.xlabel("x")
plt.ylabel("y")
plt.legend(loc="best")
# NOTE: the original repeated this exact figure verbatim as figure(3);
# the copy-paste duplicate added no information and was removed.

# In[64]:

get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.ensemble import RandomForestRegressor
# BUG FIX: sklearn.learning_curve was removed in scikit-learn 0.20;
# learning_curve now lives in sklearn.model_selection.
from sklearn.model_selection import learning_curve

# Random-forest fit of the same noisy log data.
rfg = RandomForestRegressor()
rfg.fit(x.reshape(-1, 1), y)

plt.plot(r_x, rfg.predict(r_x.reshape(-1, 1)), c="yellow",
         label="random forest fit")
plt.plot(np.arange(1, 100), np.log(np.arange(1, 100)),
         c="chartreuse", label="log(x) true function")
plt.scatter(x, y, label="log(x) with some noise")
plt.xlabel("x")
plt.ylabel("y")

# NOTE(review): figure(2) is reused here, matching the original; in a
# plain-script run this draws onto the earlier residual figure — confirm
# that is intended (each notebook cell normally opens fresh figures).
plt.figure(2)
plt.plot(r_x, rfg.predict(r_x.reshape(-1, 1)), c="green",
         label="random forest fit")
plt.scatter(x, y - rfg.predict(x.reshape(-1, 1)), c="r", label="residual")