#!/usr/bin/env python
# coding: utf-8

# # Chapter 14: Statistical modelling

# Robert Johansson
#
# Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://www.apress.com/us/book/9781484242452) (ISBN 978-1-4842-4245-2).

# In[1]:

import statsmodels.api as sm

# In[2]:

import statsmodels.formula.api as smf

# In[3]:

import statsmodels.graphics.api as smg

# In[4]:

import patsy

# In[5]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# In[6]:

import numpy as np

# In[7]:

import pandas as pd

# In[8]:

from scipy import stats

# In[9]:

import seaborn as sns

# ## Statistical models and patsy formula

# In[10]:

np.random.seed(123456789)

# In[11]:

y = np.array([1, 2, 3, 4, 5])

# In[12]:

x1 = np.array([6, 7, 8, 9, 10])

# In[13]:

x2 = np.array([11, 12, 13, 14, 15])

# In[14]:

X = np.vstack([np.ones(5), x1, x2, x1*x2]).T

# In[15]:

X

# In[16]:

# rcond=None opts in to the current default and silences a FutureWarning
beta, res, rank, sval = np.linalg.lstsq(X, y, rcond=None)

# In[17]:

beta

# In[18]:

data = {"y": y, "x1": x1, "x2": x2}

# In[19]:

y, X = patsy.dmatrices("y ~ 1 + x1 + x2 + x1*x2", data)

# In[20]:

y

# In[21]:

X

# In[22]:

type(X)

# In[23]:

np.array(X)

# In[24]:

df_data = pd.DataFrame(data)

# In[25]:

y, X = patsy.dmatrices("y ~ 1 + x1 + x2 + x1:x2", df_data, return_type="dataframe")

# In[26]:

X

# In[27]:

model = sm.OLS(y, X)

# In[28]:

result = model.fit()

# In[29]:

result.params

# In[30]:

model = smf.ols("y ~ 1 + x1 + x2 + x1:x2", df_data)

# In[31]:

result = model.fit()

# In[32]:

result.params

# In[33]:

print(result.summary())

# In[34]:

beta

# In[35]:

from collections import defaultdict

# In[36]:

data = defaultdict(lambda: np.array([1, 2, 3]))

# In[37]:

patsy.dmatrices("y ~ a", data=data)[1].design_info.term_names

# In[38]:

patsy.dmatrices("y ~ 1 + a + b", data=data)[1].design_info.term_names

# In[39]:

patsy.dmatrices("y ~ -1 + a + b", data=data)[1].design_info.term_names

# In[40]:

patsy.dmatrices("y ~ a * b", data=data)[1].design_info.term_names

# In[41]:

patsy.dmatrices("y ~ a * b * c", data=data)[1].design_info.term_names

# In[42]:

patsy.dmatrices("y ~ a * b * c - a:b:c", data=data)[1].design_info.term_names

# In[43]:

data = {k: np.array([]) for k in ["y", "a", "b", "c"]}

# In[44]:

patsy.dmatrices("y ~ a + b", data=data)[1].design_info.term_names

# In[45]:

patsy.dmatrices("y ~ I(a + b)", data=data)[1].design_info.term_names

# In[46]:

patsy.dmatrices("y ~ a*a", data=data)[1].design_info.term_names

# In[47]:

patsy.dmatrices("y ~ I(a**2)", data=data)[1].design_info.term_names

# In[48]:

patsy.dmatrices("y ~ np.log(a) + b", data=data)[1].design_info.term_names

# In[49]:

z = lambda x1, x2: x1 + x2

# In[50]:

patsy.dmatrices("y ~ z(a, b)", data=data)[1].design_info.term_names

# ### Categorical variables

# In[51]:

data = {"y": [1, 2, 3], "a": [1, 2, 3]}

# In[52]:

patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]

# In[53]:

patsy.dmatrices("y ~ - 1 + C(a)", data=data, return_type="dataframe")[1]

# In[54]:

data = {"y": [1, 2, 3], "a": ["type A", "type B", "type C"]}

# In[55]:

patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]

# In[56]:

patsy.dmatrices("y ~ - 1 + C(a, Poly)", data=data, return_type="dataframe")[1]
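# **Aside (not part of the original listings):** with patsy's default Treatment
# coding, keeping the intercept makes `C(a)` drop the first level ("type A")
# and encode the remaining levels as 0/1 indicator columns relative to it.
# A minimal check, reusing the string-valued `data` defined above:

patsy.dmatrices("y ~ C(a)", data=data, return_type="dataframe")[1]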
# # Linear regression

# In[57]:

np.random.seed(123456789)

# In[58]:

N = 100

# In[59]:

x1 = np.random.randn(N)

# In[60]:

x2 = np.random.randn(N)

# In[61]:

data = pd.DataFrame({"x1": x1, "x2": x2})

# In[62]:

def y_true(x1, x2):
    return 1 + 2 * x1 + 3 * x2 + 4 * x1 * x2

# In[63]:

data["y_true"] = y_true(x1, x2)

# In[64]:

e = np.random.randn(N)

# In[65]:

data["y"] = data["y_true"] + e

# In[66]:

data.head()

# In[67]:

fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].scatter(data["x1"], data["y"])
axes[1].scatter(data["x2"], data["y"])
fig.tight_layout()

# In[68]:

data.shape

# In[69]:

model = smf.ols("y ~ x1 + x2", data)

# In[70]:

result = model.fit()

# In[71]:

print(result.summary())

# In[72]:

result.rsquared

# In[73]:

result.resid.head()

# In[74]:

z, p = stats.normaltest(result.resid.values)

# In[75]:

p

# In[76]:

result.params

# In[77]:

fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(result.resid, ax=ax)
fig.tight_layout()
fig.savefig("ch14-qqplot-model-1.pdf")

# In[78]:

model = smf.ols("y ~ x1 + x2 + x1*x2", data)

# In[79]:

result = model.fit()

# In[80]:

print(result.summary())

# In[81]:

result.params

# In[82]:

result.rsquared

# In[83]:

z, p = stats.normaltest(result.resid.values)

# In[84]:

p

# In[85]:

fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(result.resid, ax=ax)
fig.tight_layout()
fig.savefig("ch14-qqplot-model-2.pdf")

# In[86]:

x = np.linspace(-1, 1, 50)

# In[87]:

X1, X2 = np.meshgrid(x, x)

# In[88]:

new_data = pd.DataFrame({"x1": X1.ravel(), "x2": X2.ravel()})

# In[89]:

y_pred = result.predict(new_data)

# In[90]:

y_pred.shape

# In[91]:

y_pred = y_pred.values.reshape(50, 50)

# In[92]:

fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

def plot_y_contour(ax, Y, title):
    c = ax.contourf(X1, X2, Y, 15, cmap=plt.cm.RdBu)
    ax.set_xlabel(r"$x_1$", fontsize=20)
    ax.set_ylabel(r"$x_2$", fontsize=20)
    ax.set_title(title)
    cb = fig.colorbar(c, ax=ax)
    cb.set_label(r"$y$", fontsize=20)

plot_y_contour(axes[0], y_true(X1, X2), "true relation")
plot_y_contour(axes[1], y_pred, "fitted model")

fig.tight_layout()
fig.savefig("ch14-comparison-model-true.pdf")

# ### Datasets from R

# In[93]:

dataset = sm.datasets.get_rdataset("Icecream", "Ecdat")

# In[94]:

dataset.title

# In[95]:

print(dataset.__doc__)

# In[96]:

dataset.data.info()

# In[97]:

model = smf.ols("cons ~ -1 + price + temp", data=dataset.data)

# In[98]:

result = model.fit()

# In[99]:

print(result.summary())

# In[100]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
smg.plot_fit(result, 0, ax=ax1)
smg.plot_fit(result, 1, ax=ax2)
fig.tight_layout()
fig.savefig("ch14-regressionplots.pdf")

# In[101]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# seaborn >= 0.12 requires these arguments to be passed by keyword
sns.regplot(x="price", y="cons", data=dataset.data, ax=ax1)
sns.regplot(x="temp", y="cons", data=dataset.data, ax=ax2)
fig.tight_layout()
fig.savefig("ch14-regressionplots-seaborn.pdf")

# ## Discrete regression, logistic regression

# In[102]:

df = sm.datasets.get_rdataset("iris").data

# In[103]:

df.info()

# In[104]:

df.Species.unique()

# In[105]:

df_subset = df[(df.Species == "versicolor") | (df.Species == "virginica")].copy()

# In[106]:

# equivalent, more concise selection
df_subset = df[df.Species.isin(["versicolor", "virginica"])].copy()

# In[107]:

df_subset.Species = df_subset.Species.map({"versicolor": 1, "virginica": 0})

# In[108]:

df_subset.rename(columns={"Sepal.Length": "Sepal_Length",
                          "Sepal.Width": "Sepal_Width",
                          "Petal.Length": "Petal_Length",
                          "Petal.Width": "Petal_Width"}, inplace=True)

# In[109]:

df_subset.head(3)

# In[110]:

model = smf.logit("Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width",
                  data=df_subset)

# In[111]:

result = model.fit()

# In[112]:

print(result.summary())

# In[113]:

print(result.get_margeff().summary())

# **Note:** judging by the p-values and marginal effects above, Sepal_Length and
# Sepal_Width do not appear to add much to the predictive power of the model,
# so we refit below using only the petal measurements.
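# **Aside (not part of the original listings):** for a logit model the
# exponentiated coefficients are odds ratios: the multiplicative change in the
# odds of Species == 1 per unit increase in each regressor. Reusing the fitted
# `result` from above:

np.exp(result.params)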
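# **Aside (not part of the original listings):** a quick in-sample check of
# classification accuracy, thresholding the predicted probabilities from the
# fitted `result` above at 0.5:

pred = (result.predict(df_subset) > 0.5).astype(int)
(pred == df_subset.Species).mean()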
# In[114]:

model = smf.logit("Species ~ Petal_Length + Petal_Width", data=df_subset)

# In[115]:

result = model.fit()

# In[116]:

print(result.summary())

# In[117]:

print(result.get_margeff().summary())

# In[118]:

# decision boundary in the petal plane: Petal_Width = beta0 + beta1 * Petal_Length
params = result.params
beta0 = -params['Intercept'] / params['Petal_Width']
beta1 = -params['Petal_Length'] / params['Petal_Width']

# In[119]:

df_new = pd.DataFrame({"Petal_Length": np.random.randn(20) * 0.5 + 5,
                       "Petal_Width": np.random.randn(20) * 0.5 + 1.7})

# In[120]:

df_new["P-Species"] = result.predict(df_new)

# In[121]:

df_new["P-Species"].head(3)

# In[122]:

df_new["Species"] = (df_new["P-Species"] > 0.5).astype(int)

# In[123]:

df_new.head()

# In[124]:

fig, ax = plt.subplots(1, 1, figsize=(8, 4))

ax.plot(df_subset[df_subset.Species == 0].Petal_Length.values,
        df_subset[df_subset.Species == 0].Petal_Width.values,
        's', label='virginica')
ax.plot(df_new[df_new.Species == 0].Petal_Length.values,
        df_new[df_new.Species == 0].Petal_Width.values,
        'o', markersize=10, color="steelblue", label='virginica (pred.)')

ax.plot(df_subset[df_subset.Species == 1].Petal_Length.values,
        df_subset[df_subset.Species == 1].Petal_Width.values,
        's', label='versicolor')
ax.plot(df_new[df_new.Species == 1].Petal_Length.values,
        df_new[df_new.Species == 1].Petal_Width.values,
        'o', markersize=10, color="green", label='versicolor (pred.)')

_x = np.array([4.0, 6.1])
ax.plot(_x, beta0 + beta1 * _x, 'k')

ax.set_xlabel('Petal length')
ax.set_ylabel('Petal width')
ax.legend(loc=2)
fig.tight_layout()
fig.savefig("ch14-logit.pdf")

# ### Poisson distribution

# In[125]:

dataset = sm.datasets.get_rdataset("discoveries")

# In[126]:

df = dataset.data.set_index("time").rename(columns={"value": "discoveries"})

# In[127]:

df.head(10).T

# In[128]:

fig, ax = plt.subplots(1, 1, figsize=(16, 4))
df.plot(kind='bar', ax=ax)
fig.tight_layout()
fig.savefig("ch14-discoveries.pdf")

# In[129]:

model = smf.poisson("discoveries ~ 1", data=df)

# In[130]:

result = model.fit()

# In[131]:

print(result.summary())

# In[132]:

lmbda = np.exp(result.params)

# In[133]:

X = stats.poisson(lmbda)

# In[134]:

result.conf_int()

# In[135]:

X_ci_l = stats.poisson(np.exp(result.conf_int().values)[0, 0])

# In[136]:

X_ci_u = stats.poisson(np.exp(result.conf_int().values)[0, 1])

# In[137]:

# density=True replaces the "normed" argument, which was removed from NumPy
v, k = np.histogram(df.values, bins=12, range=(0, 12), density=True)

# In[138]:

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.bar(k[:-1], v, color="steelblue", align='center', label='Discoveries per year')
ax.bar(k - 0.125, X_ci_l.pmf(k), color="red", alpha=0.5, align='center', width=0.25,
       label='Poisson fit (CI, lower)')
ax.bar(k, X.pmf(k), color="green", align='center', width=0.5, label='Poisson fit')
ax.bar(k + 0.125, X_ci_u.pmf(k), color="red", alpha=0.5, align='center', width=0.25,
       label='Poisson fit (CI, upper)')
ax.legend()
fig.tight_layout()
fig.savefig("ch14-discoveries-per-year.pdf")

# ## Time series

# In[139]:

df = pd.read_csv("temperature_outdoor_2014.tsv", header=None, delimiter="\t",
                 names=["time", "temp"])
df.time = pd.to_datetime(df.time, unit="s")
df = df.set_index("time").resample("H").mean()

# In[140]:

df_march = df[df.index.month == 3]

# In[141]:

df_april = df[df.index.month == 4]

# In[142]:

df_march.plot(figsize=(12, 4));

# In[143]:

fig, axes = plt.subplots(1, 4, figsize=(12, 3))
smg.tsa.plot_acf(df_march.temp, lags=72, ax=axes[0])
smg.tsa.plot_acf(df_march.temp.diff().dropna(), lags=72, ax=axes[1])
smg.tsa.plot_acf(df_march.temp.diff().diff().dropna(), lags=72, ax=axes[2])
smg.tsa.plot_acf(df_march.temp.diff().diff().diff().dropna(), lags=72, ax=axes[3])
fig.tight_layout()
fig.savefig("ch14-timeseries-autocorrelation.pdf")
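# **Aside (not part of the original listings):** the augmented Dickey-Fuller
# test complements the ACF plots with a formal stationarity check; a small
# p-value rejects the unit-root null. A minimal sketch on the hourly March
# series (dropna() guards against gaps left by the resampling):

from statsmodels.tsa.stattools import adfuller
adf_stat, adf_p, *_ = adfuller(df_march.temp.dropna())
adf_stat, adf_p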
# In[144]:

# NOTE: sm.tsa.AR was removed in statsmodels 0.13; sm.tsa.AutoReg is its successor.
model = sm.tsa.AR(df_march.temp)

# In[145]:

result = model.fit(72)

# In[146]:

sm.stats.durbin_watson(result.resid)

# In[147]:

fig, ax = plt.subplots(1, 1, figsize=(8, 3))
smg.tsa.plot_acf(result.resid, lags=72, ax=ax)
fig.tight_layout()
fig.savefig("ch14-timeseries-resid-acf.pdf")

# In[148]:

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-72:], df_march.temp.values[-72:], label="train data")
ax.plot(df_april.index.values[:72], df_april.temp.values[:72], label="actual outcome")
ax.plot(pd.date_range("2014-04-01", "2014-04-04", freq="H").values,
        result.predict("2014-04-01", "2014-04-04"), label="predicted outcome")
ax.legend()
fig.tight_layout()
fig.savefig("ch14-timeseries-prediction.pdf")

# In[149]:

# Using an ARMA model on daily average temperatures

# In[150]:

df_march = df_march.resample("D").mean()

# In[151]:

df_april = df_april.resample("D").mean()

# In[152]:

# NOTE: sm.tsa.ARMA was removed in statsmodels 0.13; sm.tsa.ARIMA with
# order=(4, 0, 1) is the closest replacement.
model = sm.tsa.ARMA(df_march, (4, 1))

# In[153]:

result = model.fit()

# In[154]:

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-3:], df_march.temp.values[-3:], 's-', label="train data")
ax.plot(df_april.index.values[:3], df_april.temp.values[:3], 's-', label="actual outcome")
ax.plot(pd.date_range("2014-04-01", "2014-04-03").values,
        result.predict("2014-04-01", "2014-04-03"), 's-', label="predicted outcome")
ax.legend()
fig.tight_layout()

# # Versions

# In[155]:

get_ipython().run_line_magic('reload_ext', 'version_information')

# In[156]:

get_ipython().run_line_magic('version_information', 'numpy, matplotlib, pandas, scipy, statsmodels, patsy')
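# **Aside (not part of the original listings):** the `%version_information`
# magic requires the separately installed `version_information` IPython
# extension. If it is not available, a similar report can be produced with the
# standard library (Python >= 3.8):

import importlib.metadata
for pkg in ["numpy", "matplotlib", "pandas", "scipy", "statsmodels", "patsy"]:
    print(pkg, importlib.metadata.version(pkg))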