#!/usr/bin/env python # coding: utf-8 # # Chapter 13: Statistics # Robert Johansson # # Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://www.apress.com/us/book/9781484242452) (ISBN 978-1-484242-45-2). # ## Imports # In[1]: from scipy import stats # In[2]: from scipy import optimize # In[3]: import numpy as np import random # In[4]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt # In[5]: import seaborn as sns # In[6]: sns.set(style="whitegrid") # ## Descriptive statistics # In[7]: x = np.array([3.5, 1.1, 3.2, 2.8, 6.7, 4.4, 0.9, 2.2]) # In[8]: np.mean(x) # In[9]: np.median(x) # In[10]: x.min(), x.max() # In[11]: x.var() # In[12]: x.std() # In[13]: x.var(ddof=1) # In[14]: x.std(ddof=1) # ## Random numbers # In[15]: random.seed(123456789) # In[16]: random.random() # In[17]: random.randint(0, 10) # 0 and 10 inclusive # In[18]: np.random.seed(123456789) # In[19]: np.random.rand() # In[20]: np.random.randn() # In[21]: np.random.rand(5) # In[22]: np.random.randn(2, 4) # In[23]: np.random.randint(10, size=10) # In[24]: np.random.randint(low=10, high=20, size=(2, 10)) # In[25]: fig, axes = plt.subplots(1, 3, figsize=(12, 3)) axes[0].hist(np.random.rand(10000)) axes[0].set_title("rand") axes[1].hist(np.random.randn(10000)) axes[1].set_title("randn") axes[2].hist(np.random.randint(low=1, high=10, size=10000), bins=9, align='left') axes[2].set_title("randint(low=1, high=10)") fig.tight_layout() fig.savefig("ch13-random-hist.pdf") # In[26]: #random.sample(range(10), 5) # In[27]: np.random.choice(10, 5, replace=False) # In[28]: np.random.seed(123456789) # In[29]: np.random.rand() # In[30]: np.random.seed(123456789); np.random.rand() # In[31]: np.random.seed(123456789); np.random.rand() # In[32]: prng = np.random.RandomState(123456789) # In[33]: prng.randn(2, 4) # In[34]: prng.chisquare(1, size=(2, 2)) # In[35]: prng.standard_t(1, size=(2, 3)) # In[36]: prng.f(5, 2, size=(2, 4)) # In[37]: prng.binomial(10, 0.5, size=10) # In[38]: prng.poisson(5, size=10) # # Probability distributions and random variables # In[39]: np.random.seed(123456789) # In[40]: X = stats.norm(1, 0.5) # In[41]: X.mean() # In[42]: X.median() # In[43]: X.std() # In[44]: X.var() # In[45]: [X.moment(n) for n in range(5)] # In[46]: X.stats() # In[47]: X.pdf([0, 1, 2]) # In[48]: X.cdf([0, 1, 2]) # In[49]: X.rvs(10) # In[50]: stats.norm(1, 0.5).stats() # In[51]: stats.norm.stats(loc=2, scale=0.5) # In[52]: X.interval(0.95) # In[53]: X.interval(0.99) # In[54]: def plot_rv_distribution(X, axes=None): """Plot the PDF, CDF, SF and PPF of a given random variable""" if axes is None: fig, axes = plt.subplots(1, 3, figsize=(12, 3)) x_min_999, x_max_999 = X.interval(0.999) x999 = np.linspace(x_min_999, x_max_999, 1000) x_min_95, x_max_95 = X.interval(0.95) x95 = np.linspace(x_min_95, x_max_95, 1000) if hasattr(X.dist, 'pdf'): axes[0].plot(x999, X.pdf(x999), label="PDF") axes[0].fill_between(x95, X.pdf(x95), alpha=0.25) else: x999_int = np.unique(x999.astype(int)) axes[0].bar(x999_int, X.pmf(x999_int), label="PMF") axes[1].plot(x999, X.cdf(x999), label="CDF") axes[1].plot(x999, X.sf(x999), label="SF") axes[2].plot(x999, X.ppf(x999), label="PPF") for ax in axes: ax.legend() return axes # In[55]: fig, axes = plt.subplots(3, 3, figsize=(12, 9)) X = stats.norm() plot_rv_distribution(X, axes=axes[0, :]) axes[0, 0].set_ylabel("Normal dist.") X = stats.f(2, 50) plot_rv_distribution(X, axes=axes[1, :]) axes[1, 0].set_ylabel("F dist.") X = stats.poisson(5) plot_rv_distribution(X, axes=axes[2, :]) axes[2, 0].set_ylabel("Poisson dist.") fig.tight_layout() fig.savefig("ch13-distributions.pdf") # In[56]: def plot_dist_samples(X, X_samples, title=None, ax=None): """ Plot the PDF and histogram of samples of a continuous random variable""" if ax is None: fig, ax = plt.subplots(1, 1, figsize=(8, 4)) x_lim = X.interval(.99) x = np.linspace(*x_lim, num=100) ax.plot(x, X.pdf(x), label="PDF", lw=3) ax.hist(X_samples, label="samples", normed=1, bins=75) ax.set_xlim(*x_lim) ax.legend() if title: ax.set_title(title) return ax # In[57]: fig, axes = plt.subplots(1, 3, figsize=(12, 3)) X = stats.t(7.0) plot_dist_samples(X, X.rvs(2000), "Student's t dist.", ax=axes[0]) X = stats.chi2(5.0) plot_dist_samples(X, X.rvs(2000), r"$\chi^2$ dist.", ax=axes[1]) X = stats.expon(0.5) plot_dist_samples(X, X.rvs(2000), "exponential dist.", ax=axes[2]) fig.tight_layout() fig.savefig("ch13-dist-sample.pdf") # In[58]: X = stats.chi2(df=5) # In[59]: X_samples = X.rvs(500) # In[60]: df, loc, scale = stats.chi2.fit(X_samples) # In[61]: df, loc, scale # In[62]: Y = stats.chi2(df=df, loc=loc, scale=scale) # In[63]: fig, ax = plt.subplots(1, 1, figsize=(8, 3)) x_lim = X.interval(.99) x = np.linspace(*x_lim, num=100) ax.plot(x, X.pdf(x), label="original") ax.plot(x, Y.pdf(x), label="recreated") ax.legend() fig.tight_layout() fig.savefig("ch13-max-likelihood-fit.pdf") # In[64]: fig, axes = plt.subplots(1, 2, figsize=(12, 4)) x_lim = X.interval(.99) x = np.linspace(*x_lim, num=100) axes[0].plot(x, X.pdf(x), label="original") axes[0].plot(x, Y.pdf(x), label="recreated") axes[0].legend() axes[1].plot(x, X.pdf(x) - Y.pdf(x), label="error") axes[1].legend() fig.tight_layout() fig.savefig("ch13-max-likelihood-fit.pdf") # # Hypothesis testing # In[65]: np.random.seed(123456789) # In[66]: mu, sigma = 1.0, 0.5 # In[67]: X = stats.norm(mu-0.2, sigma) # In[68]: n = 100 # In[69]: X_samples = X.rvs(n) # In[70]: z = (X_samples.mean() - mu)/(sigma/np.sqrt(n)) # In[71]: z # In[72]: t = (X_samples.mean() - mu)/(X_samples.std(ddof=1)/np.sqrt(n)) # In[73]: t # In[74]: stats.norm().ppf(0.025) # In[75]: 2 * stats.norm().cdf(-abs(z)) # In[76]: 2 * stats.t(df=(n-1)).cdf(-abs(t)) # In[77]: t, p = stats.ttest_1samp(X_samples, mu) # In[78]: t # In[79]: p # In[80]: fig, ax = plt.subplots(figsize=(8, 3)) sns.distplot(X_samples, ax=ax) x = np.linspace(*X.interval(0.999), num=100) ax.plot(x, stats.norm(loc=mu, scale=sigma).pdf(x)) fig.tight_layout() fig.savefig("ch13-hypothesis-test-dist-sample-mean.pdf") # In[81]: n = 50 # In[82]: mu1, mu2 = np.random.rand(2) # In[83]: mu1, mu2 # In[84]: X1 = stats.norm(mu1, sigma) # In[85]: X1_sample = X1.rvs(n) # In[86]: X2 = stats.norm(mu2, sigma) # In[87]: X2_sample = X2.rvs(n) # In[88]: t, p = stats.ttest_ind(X1_sample, X2_sample) # In[89]: t # In[90]: p # In[91]: mu1, mu2 # In[92]: sns.distplot(X1_sample) sns.distplot(X2_sample) # # Nonparameteric methods # In[93]: np.random.seed(0) # In[94]: X = stats.chi2(df=5) # In[95]: X_samples = X.rvs(100) # In[96]: kde = stats.kde.gaussian_kde(X_samples) # In[97]: kde_low_bw = stats.kde.gaussian_kde(X_samples, bw_method=0.25) # In[98]: x = np.linspace(0, 20, 100) # In[99]: fig, axes = plt.subplots(1, 3, figsize=(12, 3)) axes[0].hist(X_samples, normed=True, alpha=0.5, bins=25) axes[1].plot(x, kde(x), label="KDE") axes[1].plot(x, kde_low_bw(x), label="KDE (low bw)") axes[1].plot(x, X.pdf(x), label="True PDF") axes[1].legend() sns.distplot(X_samples, bins=25, ax=axes[2]) fig.tight_layout() fig.savefig("ch13-hist-kde.pdf") # In[100]: kde.resample(10) # In[101]: def _kde_cdf(x): return kde.integrate_box_1d(-np.inf, x) # In[102]: kde_cdf = np.vectorize(_kde_cdf) # In[103]: fig, ax = plt.subplots(1, 1, figsize=(8, 3)) sns.distplot(X_samples, bins=25, ax=ax) x = np.linspace(0, 20, 100) ax.plot(x, kde_cdf(x)) fig.tight_layout() # In[104]: def _kde_ppf(q): return optimize.fsolve(lambda x, q: kde_cdf(x) - q, kde.dataset.mean(), args=(q,))[0] # In[105]: kde_ppf = np.vectorize(_kde_ppf) # In[106]: kde_ppf([0.05, 0.95]) # In[107]: X.ppf([0.05, 0.95]) # # Versions # In[108]: get_ipython().run_line_magic('reload_ext', 'version_information') # In[109]: get_ipython().run_line_magic('version_information', 'numpy, scipy, matplotlib, seaborn')