Fitting lines


In [1]:
# Numerical arrays and fitting lines.
import numpy as np

# Plots.
import matplotlib.pyplot as plt
In [2]:
# Nicer plot style.
plt.style.use("ggplot")

# Bigger plots.
plt.rcParams["figure.figsize"] = (18, 10)

# Seed numpy's global random number generator so that the noisy data
# sets generated later in the notebook (and therefore the fitted
# coefficients) are the same on every Restart & Run All.
np.random.seed(0)


Fitting a straight line


In [3]:
# 100 evenly spaced x values covering the closed interval [0, 10].
x = np.linspace(start=0.0, stop=10.0, num=100)
In [4]:
# Display the x values (a bare name as the last line of a cell is
# echoed as the cell's output).
x
Out[4]:
array([ 0.        ,  0.1010101 ,  0.2020202 ,  0.3030303 ,  0.4040404 ,
        0.50505051,  0.60606061,  0.70707071,  0.80808081,  0.90909091,
        1.01010101,  1.11111111,  1.21212121,  1.31313131,  1.41414141,
        1.51515152,  1.61616162,  1.71717172,  1.81818182,  1.91919192,
        2.02020202,  2.12121212,  2.22222222,  2.32323232,  2.42424242,
        2.52525253,  2.62626263,  2.72727273,  2.82828283,  2.92929293,
        3.03030303,  3.13131313,  3.23232323,  3.33333333,  3.43434343,
        3.53535354,  3.63636364,  3.73737374,  3.83838384,  3.93939394,
        4.04040404,  4.14141414,  4.24242424,  4.34343434,  4.44444444,
        4.54545455,  4.64646465,  4.74747475,  4.84848485,  4.94949495,
        5.05050505,  5.15151515,  5.25252525,  5.35353535,  5.45454545,
        5.55555556,  5.65656566,  5.75757576,  5.85858586,  5.95959596,
        6.06060606,  6.16161616,  6.26262626,  6.36363636,  6.46464646,
        6.56565657,  6.66666667,  6.76767677,  6.86868687,  6.96969697,
        7.07070707,  7.17171717,  7.27272727,  7.37373737,  7.47474747,
        7.57575758,  7.67676768,  7.77777778,  7.87878788,  7.97979798,
        8.08080808,  8.18181818,  8.28282828,  8.38383838,  8.48484848,
        8.58585859,  8.68686869,  8.78787879,  8.88888889,  8.98989899,
        9.09090909,  9.19191919,  9.29292929,  9.39393939,  9.49494949,
        9.5959596 ,  9.6969697 ,  9.7979798 ,  9.8989899 , 10.        ])
In [5]:
# Check that we got the number of points we asked linspace for (100).
len(x)
Out[5]:
100
In [6]:
# Exact straight line through the x values: slope 3, intercept 2.
y = 2.0 + 3.0 * x
In [7]:
# Display the y values computed from the straight-line formula.
y
Out[7]:
array([ 2.        ,  2.3030303 ,  2.60606061,  2.90909091,  3.21212121,
        3.51515152,  3.81818182,  4.12121212,  4.42424242,  4.72727273,
        5.03030303,  5.33333333,  5.63636364,  5.93939394,  6.24242424,
        6.54545455,  6.84848485,  7.15151515,  7.45454545,  7.75757576,
        8.06060606,  8.36363636,  8.66666667,  8.96969697,  9.27272727,
        9.57575758,  9.87878788, 10.18181818, 10.48484848, 10.78787879,
       11.09090909, 11.39393939, 11.6969697 , 12.        , 12.3030303 ,
       12.60606061, 12.90909091, 13.21212121, 13.51515152, 13.81818182,
       14.12121212, 14.42424242, 14.72727273, 15.03030303, 15.33333333,
       15.63636364, 15.93939394, 16.24242424, 16.54545455, 16.84848485,
       17.15151515, 17.45454545, 17.75757576, 18.06060606, 18.36363636,
       18.66666667, 18.96969697, 19.27272727, 19.57575758, 19.87878788,
       20.18181818, 20.48484848, 20.78787879, 21.09090909, 21.39393939,
       21.6969697 , 22.        , 22.3030303 , 22.60606061, 22.90909091,
       23.21212121, 23.51515152, 23.81818182, 24.12121212, 24.42424242,
       24.72727273, 25.03030303, 25.33333333, 25.63636364, 25.93939394,
       26.24242424, 26.54545455, 26.84848485, 27.15151515, 27.45454545,
       27.75757576, 28.06060606, 28.36363636, 28.66666667, 28.96969697,
       29.27272727, 29.57575758, 29.87878788, 30.18181818, 30.48484848,
       30.78787879, 31.09090909, 31.39393939, 31.6969697 , 32.        ])
In [8]:
# y was computed element-wise from x, so it should have the same
# number of entries as x.
len(y)
Out[8]:
100
In [9]:
# Plot y against x as a line. The trailing semicolon stops the notebook
# from echoing the list of Line2D objects that plt.plot returns, in
# keeping with the other plotting cells.
plt.plot(x, y);
Out[9]:
[<matplotlib.lines.Line2D at 0x2399452b5e0>]
In [10]:
# Ask numpy what it thinks the relationship between x and y is, based
# only on the x and y values. A degree-1 fit returns the coefficients
# highest power first, i.e. [slope, intercept].
np.polyfit(x, y, deg=1)
Out[10]:
array([3., 2.])


Include some noise


In [11]:
# Same straight line as before, plus Gaussian noise (mean 0, standard
# deviation 0.3) on every point - try increasing the noise yourself.
y = 2.0 + 3.0 * x + np.random.normal(loc=0.0, scale=0.3, size=len(x))
In [12]:
# Display the noisy y values; they no longer lie exactly on the line.
y
Out[12]:
array([ 2.25182045,  2.04697935,  2.75816048,  2.72587145,  3.31030953,
        3.26815165,  4.12030595,  3.9422849 ,  4.16329771,  4.85121506,
        5.32739357,  5.73731497,  5.79870485,  6.18411457,  6.43361871,
        6.70945849,  7.01538877,  7.1560419 ,  7.68301217,  8.47624419,
        8.12576437,  8.70365607,  9.36621669,  9.29943223,  9.40893948,
        9.28244768,  9.8119984 , 10.46911295, 10.56318299, 10.73541164,
       10.61038333, 11.25558848, 11.62705465, 11.97382863, 12.25900915,
       12.60247336, 13.10627551, 13.10601948, 13.71225391, 13.62756097,
       14.03539597, 14.12133361, 14.92000636, 15.28820574, 14.95487755,
       16.16884208, 16.13624638, 16.25404006, 16.4104769 , 16.92267383,
       17.22146602, 17.69664052, 17.58636777, 18.15932648, 18.46659384,
       18.38345605, 19.07303252, 18.97584274, 19.24104591, 19.86666466,
       19.80869216, 20.50548947, 20.92696946, 20.90782713, 21.54611264,
       21.80778344, 22.37494431, 22.70618234, 22.81651129, 22.98874412,
       23.4806887 , 23.05761525, 23.35627904, 24.37489591, 24.26919502,
       24.96751308, 24.89203076, 25.00986091, 25.6917781 , 25.51007312,
       25.94709913, 26.4384208 , 26.8785489 , 26.86124299, 27.37315616,
       27.8439393 , 28.12325066, 29.35450396, 29.09445521, 29.05870172,
       29.13327274, 29.91184334, 29.55587019, 30.22914905, 30.73005066,
       30.42541106, 31.47959928, 31.55074943, 31.37104774, 32.21999387])
In [13]:
# Plot the noisy y values against x. The '.' format draws one dot per
# sample instead of joining the samples with a line.
plt.plot(x, y, '.');
In [14]:
# Fit a straight line to the noisy data and show the estimated
# [slope, intercept].
coeffs = np.polyfit(x, y, deg=1)
coeffs
Out[14]:
array([2.9907589 , 2.08688875])
In [15]:
# Draw the noisy samples, then overlay the fitted line evaluated at
# the same x values.
plt.plot(x, y, ".", label="Data")
plt.plot(x, np.polyval(coeffs, x), "-", label="Best fit")
plt.legend();


More than one dataset on one plot


In [16]:
# Two data sets follow, each with its own x and y values.
# First data set: 20 evenly spaced x values in [0, 0.5].
x1 = np.linspace(start=0.0, stop=0.5, num=20)
In [17]:
# First data set: line with slope 3 and intercept 2, plus Gaussian
# noise of standard deviation 1.
y1 = 2.0 + 3.0 * x1 + np.random.normal(loc=0.0, scale=1.0, size=len(x1))
In [18]:
# Second data set: 30 evenly spaced x values in [0, 0.5].
x2 = np.linspace(start=0.0, stop=0.5, num=30)
In [19]:
# Second data set: line with slope 6 and intercept 5, plus Gaussian
# noise of standard deviation 1.
y2 = 5.0 + 6.0 * x2 + np.random.normal(loc=0.0, scale=1.0, size=len(x2))
In [20]:
# Fit each data set separately with polyfit, starting with the first.
coeffs1 = np.polyfit(x1, y1, deg=1)
coeffs1
Out[20]:
array([1.80573562, 2.21192559])
In [21]:
# Straight-line fit for the second data set.
coeffs2 = np.polyfit(x2, y2, deg=1)
coeffs2
Out[21]:
array([6.23770296, 5.23389885])
In [22]:
# Show both data sets, each with its own best-fit line, on one figure.
plt.plot(x1, y1, ".", label="Data set 1")
plt.plot(x1, np.polyval(coeffs1, x1), "-", label="Best fit line 1")
plt.plot(x2, y2, ".", label="Data set 2")
plt.plot(x2, np.polyval(coeffs2, x2), "-", label="Best fit line 2")
plt.legend();


Combine data sets


In [23]:
# Pool the two data sets into one, for no particular reason.
# The x values of set 1 come first, followed by those of set 2.
x = np.concatenate((x1, x2))
In [24]:
# Pool the y values in the same order as the x values (set 1, then set 2).
y = np.concatenate((y1, y2))
In [25]:
# Fit a single straight line to the pooled data.
coeffs = np.polyfit(x, y, deg=1)
coeffs
Out[25]:
array([4.42928212, 4.03401803])
In [26]:
# Compare the fit to the combined data with the fits to the two
# original data sets.
plt.plot(x1, y1, ".", label="Data set 1")
plt.plot(x1, np.polyval(coeffs1, x1), "-", label="Best fit line 1")
plt.plot(x2, y2, ".", label="Data set 2")
plt.plot(x2, np.polyval(coeffs2, x2), "-", label="Best fit line 2")
# The combined points are exactly the two data sets already drawn
# above, so only the combined best-fit line is added here.
plt.plot(x, np.polyval(coeffs, x), "-", label="Best fit combined")
plt.legend();

End