# import
import numpy as np
import scipy as sp
import timeit
import matplotlib.pyplot as plt
%matplotlib inline
Comparing the time
start = timeit.timeit()
X = range(1000)
pySum = sum([n*n for n in X])
end = timeit.timeit()
print("Total time taken: ", end-start)
Total time taken: -0.006210096431396868
** Learning Scipy **
# reading the web data
data = sp.genfromtxt("data/web_traffic.tsv", delimiter="\t")
print(data[:3])
print(len(data))
[[ 1.00000000e+00 2.27200000e+03] [ 2.00000000e+00 nan] [ 3.00000000e+00 1.38600000e+03]] 743
** Preprocessing and Cleaning the data **
X = data[:, 0]
y = data[:, 1]
# checking for nan values
print(sum(np.isnan(X)))
print(sum(np.isnan(y)))
0 8
** Filtering the nan data **
X = X[~np.isnan(y)]
y = y[~np.isnan(y)]
# checking for nan values
print(sum(np.isnan(X)))
print(sum(np.isnan(y)))
0 0
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(X, y, '.b')
ax.margins(0.2)
plt.xticks([w*24*7 for w in range(0, 6)], ["week %d" %w for w in range(0, 6)])
ax.set_xlabel("Week")
ax.set_ylabel("Hits / Week")
ax.set_title("Web Traffic over weeks")
<matplotlib.text.Text at 0x297739d8710>
** Choosing the right model and learning algorithm **
# creating a error calc fuction
def error(f, x, y):
return np.sum((f(x) - y)**2)
** Linear 1-d model **
# sp's polyfit func do the same
fp1, residuals, rank, sv, rcond = sp.polyfit(X, y, 1, full=True)
print(fp1)
print(residuals)
# generating the one order function
f1 = sp.poly1d(fp1)
# checking error
print("Error : ",error(f1, X, y))
x1 = np.array([-100, np.max(X)+100])
y1 = f1(x1)
ax.plot(x1, y1, c='g', linewidth=2)
ax.legend(["data", "d = %i" % f1.order], loc='best')
fig
[ 2.59619213 989.02487106] [ 3.17389767e+08] Error : 317389767.34
$$ f(x) = 2.59619213 * x + 989.02487106 $$
** Polynomial 2-d **
# sp's polyfit func do the same
fp2 = sp.polyfit(X, y, 2)
print(fp2)
# generating the 2 order function
f2= sp.poly1d(fp2)
# checking error
print("Error : ",error(f2, X, y))
x1= np.linspace(-100, np.max(X)+100, 2000)
y2= f2(x1)
ax.plot(x1, y2, c='r', linewidth=2)
ax.legend(["data", "d = %i" % f1.order, "d = %i" % f2.order], loc='best')
fig
[ 1.05322215e-02 -5.26545650e+00 1.97476082e+03] Error : 179983507.878
What if we want to regress two response output instead of one, As we can see in the graph that there is a steep change in data between week 3 and 4, so let's draw two reponses line, one for the data between week0 and week3.5 and second for week3.5 to week5
# we are going to divide the data on time so
div = 3.5*7*24
X1 = X[X<=div]
Y1 = y[X<=div]
X2 = X[X>div]
Y2 = y[X>div]
# now plotting the both data
fa = sp.poly1d(sp.polyfit(X1, Y1, 1))
fb = sp.poly1d(sp.polyfit(X2, Y2, 1))
fa_error = error(fa, X1, Y1)
fb_error = error(fb, X2, Y2)
print("Error inflection = %f" % (fa_error + fb_error))
Error inflection = 135015350.586215
x1 = np.linspace(-100, X1[-1]+100, 1000)
x2 = np.linspace(X1[-10], X2[-1]+100, 1000)
ya = fa(x1)
yb = fb(x2)
ax.plot(x1, ya, c='#800000', linewidth=2) # brown
ax.plot(x2, yb, c='#FFA500', linewidth=2) # orange
ax.grid(True)
fig
Suppose we choose that function with degree 2 is best fit for our data and want to predict that if everything will go same then when we will hit the 100000 count ??
$$ 0 = f(x) - 100000 = 0.0105322215 * x^2 - 5.26545650 * x + 1974.6082 - 100000 $$SciPy's optimize module has the function fsolve that achieves this, when providing an initial starting position with parameter x0. As every entry in our input data file corresponds to one hour, and we have 743 of them, we set the starting position to some value after that. Let fbt2 be the winning polynomial of degree 2.
print(f2)
2 0.01053 x - 5.265 x + 1975
print(f2 - 100000)
# import
from scipy.optimize import fsolve
reached_max = fsolve(f2-100000, x0=800)/(7*24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
2 0.01053 x - 5.265 x - 9.803e+04 100,000 hits/hour expected at week 19.708090