In [1]:
import numpy
from matplotlib import pyplot
from scipy.stats import t
In [2]:
pyplot.style.use("seaborn")
In [3]:
def get_linear_model(x, y):
    x_mean = x.mean()
    y_mean = y.mean()
    sum_x_squared = numpy.sum((x - x_mean)**2)
    sum_y_squared = numpy.sum((y - y_mean)**2)
    sum_x_y = numpy.sum((x - x_mean) * (y - y_mean))
    
    slope = sum_x_y / sum_x_squared
    intercept = y_mean - x_mean * slope

    predictions = x * slope + intercept
    sum_squared_error = numpy.sum((y - predictions)**2)
    standard_error = numpy.sqrt(sum_squared_error / ((len(x) - 2) * sum_x_squared))
    
    return { "slope": slope, "intercept": intercept, "standard_error": standard_error }

def predict(x, model):
    return x * model["slope"] + model["intercept"]
In [16]:
real_slope = 0.3

real_x = numpy.random.normal(0,1, size=10)
noise = numpy.random.normal(0,1, size=10)
real_y = real_x * real_slope  +  noise

model = get_linear_model(real_x, real_y)
print(model) 

x_extent = numpy.array([real_x.min(), real_x.max()])
print(predict(x_extent, model))
{'slope': 0.32568102722780634, 'intercept': -0.12031786445452539, 'standard_error': 0.41477410855620606}
[-0.57190105  0.40724753]
In [17]:
pyplot.scatter(real_x, real_y)
pyplot.plot( x_extent, predict(x_extent, model) )
pyplot.show()
In [18]:
pyplot.scatter(numpy.sort(real_x), numpy.sort(real_y))
pyplot.show()
In [19]:
def permute(y):
    # shuffle modifies the array itself, so first make a copy
    permuted_y = y.copy()
    # now randomize it
    numpy.random.shuffle(permuted_y)
    return permuted_y
In [20]:
palette = pyplot.get_cmap("Set1")
In [21]:
for panel in range(1, 22):
    pyplot.subplot(3,7, panel)
    
    if panel == 1:
        pyplot.scatter(real_x, real_y, color=palette(0))
        pyplot.plot( x_extent, predict(x_extent, model), color=palette(0) )
    else:
        fake_y = permute(real_y)
        fake_model = get_linear_model(real_x, fake_y)
        pyplot.scatter(real_x, fake_y, color=palette(1))
        if fake_model["slope"] > model["slope"]:
            color = palette(0)
        else:
            color = palette(1)
        pyplot.plot( x_extent, predict(x_extent, fake_model), color=color )
    pyplot.tick_params(labelbottom=False)
    pyplot.tick_params(labelleft=False)


pyplot.show()
In [22]:
fake_slopes = numpy.zeros(1000)

steeper_slopes = 0

for i in range(1000):
    fake_y = permute(real_y)
    fake_model = get_linear_model(real_x, fake_y)
    
    if numpy.abs(fake_model["slope"]) > numpy.abs(model["slope"]):
        steeper_slopes += 1
    
    fake_slopes[i] = fake_model["slope"] / fake_model["standard_error"]

print((steeper_slopes / 2) / 1000, 1 - t.cdf(numpy.abs(model["slope"]) / model["standard_error"], df=8))
0.2105 0.22747921658685377
In [11]:
bins = numpy.linspace(-5, 5, 30)
pyplot.hist(fake_slopes, bins=bins)
pyplot.plot(bins, t.pdf(bins, df=8) * 1000 * (bins[1] - bins[0])) # probability density * N * width of bin
pyplot.show()