#!/usr/bin/env python
# coding: utf-8

# # Scientific Python Quickstart
#
# #### [John Stachurski](http://johnstachurski.net/)
#
# #### ANU
#
# This is a fast-paced, hands-on introduction to scientific computing with Python, contained in a [Jupyter](http://jupyter.org/) notebook. The main focus will be on introducing Python's four most important scientific libraries: NumPy, SciPy, Pandas and Matplotlib.
#
# If you don't know how to use this notebook, you'll need to work through [this page](http://quant-econ.net/py/getting_started.html) first.
#
# A slower, more detailed and more systematic treatment of Python for scientific applications can be found at [quant-econ.net](http://quant-econ.net/py/index.html). But this notebook is a good place to start for those who like to learn by doing.
#
# Here's some information on the version of Python that I'm using:

# In[241]:

import sys
print(sys.version)

# ## Basic NumPy
#
# Perhaps the single most important scientific library for Python is NumPy. NumPy provides foundational data structures and routines on which many other libraries rely.

# In[242]:

import numpy as np     # Import library and give it the alias np
print(np.__version__)  # The version I'm using

# NumPy defines a basic data type called an array (actually a numpy.ndarray)

# In[243]:

a = np.zeros(3)  # Create an array of zeros
a                # Print a

# In[244]:

type(a)

# Note that array data *must be homogeneous*
#
# The most important data types are:
#
# * float64: 64 bit floating point number
# * float32: 32 bit floating point number
# * int64: 64 bit integer
# * int32: 32 bit integer
# * bool: 8 bit True or False
#
# There are also dtypes to represent complex numbers, unsigned integers, etc.
#
# On most machines, the default dtype for arrays is ``float64``

# In[245]:

a = np.zeros(3)
type(a[1])

# When we create an array such as

# In[246]:

z = np.zeros(10)

# ``z`` is a "flat" array with no dimension --- neither row nor column vector:

# In[247]:

z.shape

# Here the shape tuple has only one element, which is the length of the array (tuples with one element end with a comma)
#
# To give it dimension, we can change the ``shape`` attribute
#
# For example, let's make it a column vector

# In[248]:

z.shape = (10, 1)

# In[249]:

z

# In[250]:

z = np.zeros(4)
z.shape = (2, 2)
z

# ### Creating arrays
#
# Creating empty arrays --- initializing memory:

# In[251]:

z = np.empty(3)
z

# These are just garbage numbers --- whatever was in those memory slots
#
# Here's how to make a regular grid sequence

# In[252]:

z = np.linspace(2, 4, 5)  # From 2 to 4, with 5 elements
z

# Creating an array of ones

# In[253]:

z = np.ones(3)
z

# In[254]:

z = np.identity(2)
z

# Arrays can be made from Python lists or tuples

# In[255]:

z = np.array([10, 20])
z

# In[256]:

z = np.array((10, 20), dtype=float)
z

# In[257]:

z = np.array([[1, 2], [3, 4]])  # 2D array from a list of lists
z
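# Two other creation routines that often come in handy are ``np.arange`` and ``np.full``. A quick sketch (these aren't used elsewhere in this notebook):

# In[ ]:

np.arange(5)  # Integers 0, ..., 4, analogous to Python's built-in range

# In[ ]:

np.full((2, 2), 7.0)  # A 2 x 2 array with every element equal to 7.0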
# ### Array indexing

# In[258]:

z = np.linspace(1, 2, 5)
z

# In[259]:

z[0]  # First element --- Python sequences are zero based, like C, Java, etc.

# In[260]:

z[-1]  # Special syntax for the last element

# In[261]:

z[0:2]  # Meaning: two elements, starting from element 0

# In[262]:

z = np.array([[1, 2], [3, 4]])
z

# In[263]:

z[0, 0]

# In[264]:

z[0, :]  # First row

# In[265]:

z[:, 0]  # First column

# In[266]:

z = np.linspace(2, 4, 5)
z

# In[267]:

d = np.array([0, 1, 1, 0, 0], dtype=bool)
d

# In[268]:

z[d]

# ### Array methods

# In[269]:

A = np.array((4, 3, 2, 1))
A

# In[270]:

A.sort()

# In[271]:

A

# In[272]:

A.mean()

# In[273]:

A.sum()

# In[274]:

A.max()

# In[275]:

A.cumsum()

# In[276]:

A.var()

# In[277]:

A.shape = (2, 2)
A

# In[278]:

A.T  # Transpose, equivalent to A.transpose()

# ### Operations on arrays
#
# Standard arithmetic operations on arrays act elementwise

# In[279]:

a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])

# In[280]:

a + b

# In[281]:

a - b

# In[282]:

a + 10

# In[283]:

a.shape = 2, 2
b.shape = 2, 2

# In[284]:

a

# In[285]:

b

# In[286]:

a * b  # Pointwise multiplication!!

# In[287]:

np.dot(a, b)  # Matrix multiplication

# For Python $\geq 3.5$ and NumPy $\geq 1.10$ the ``@`` operator also works.

# In[288]:

a @ b

# I'll continue to use ``np.dot`` below for the benefit of those who are using older versions. But in my opinion the ``@`` operator is much nicer.

# ### Comparisons

# In[289]:

z = np.array([2, 3])
y = np.array([2, 3])
z == y

# In[290]:

y[0] = 3
z == y

# In[291]:

z = np.linspace(0, 10, 5)
z

# In[292]:

z > 3

# In[293]:

z[z > 3]  # Conditional extraction

# ## Matplotlib
#
# Matplotlib is an outstanding plotting and visualization library for Python that interacts nicely with NumPy. Here are a few quick examples. We'll see more below when we discuss the SciPy library.

# In[294]:

import matplotlib.pyplot as plt  # Import main functionality

# Display figures in this browser window rather than having them open up separately:

# In[295]:

get_ipython().run_line_magic('matplotlib', 'inline')

# Create something to plot

# In[296]:

x = np.linspace(-2, 2, 100)
y = x**2

# In[297]:

fig, ax = plt.subplots()  # Create axes and figure window
ax.plot(x, y, 'b-')

# Here's a slightly more complex plot

# In[298]:

y3 = x**3
fig, ax = plt.subplots()  # Create axes and figure window
ax.plot(x, y, 'b-', lw=2, alpha=0.8, label='$x^2$')
ax.plot(x, y3, 'g-', lw=2, alpha=0.8, label='$x^3$')
ax.legend(loc='lower right')

# ## SciPy
#
# Let's just cover some simple examples --- references for further reading are below

# ### Statistics and distributions
#
# Let's use ``scipy.stats`` to generate some data from the Beta distribution

# In[299]:

from scipy.stats import beta

q = beta(5, 5)     # Beta(a, b), with a = b = 5
obs = q.rvs(2000)  # 2000 observations

# Now let's histogram the data and compare it to the original density

# In[300]:

fig, ax = plt.subplots()
ax.hist(obs, bins=40, density=True)  # density=True replaces the deprecated normed=True
grid = np.linspace(0.01, 0.99, 100)
ax.plot(grid, q.pdf(grid), 'k-', linewidth=2)

# Other methods

# In[301]:

type(q)

# In[302]:

dir(q)  # Let's see all its methods

# In[303]:

q.cdf(0.5)

# In[304]:

q.pdf(0.5)

# In[305]:

q.mean()

# Basic linear regression:

# In[306]:

from scipy.stats import linregress

n = 100
alpha, beta, sigma = 1, 2, 1.5  # True parameters (note: this rebinds beta, shadowing the import above)
x = np.random.randn(n)          # n standard normals
y = alpha + beta * x + sigma * np.random.randn(n)
beta_hat, alpha_hat, r_value, p_value, std_err = linregress(x, y)
print("gradient = {}".format(beta_hat))
print("intercept = {}".format(alpha_hat))

# Let's plot the data and the line of best fit

# In[307]:

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x, y, 'bo', alpha=0.6, label='observations')
xgrid = np.linspace(-3, 3, 2)
ax.plot(xgrid, alpha_hat + beta_hat * xgrid, 'k-', lw=2, alpha=0.8, label='best fit')
ax.grid()
ax.legend(loc='upper left')
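# As a cross-check, the slope reported by ``linregress`` should agree with the textbook least squares formula
#
# $$
# \hat \beta = \frac{\sum_i (x_i - \bar x)(y_i - \bar y)}{\sum_i (x_i - \bar x)^2}
# $$
#
# Here's a quick sketch of that calculation, reusing the ``x`` and ``y`` generated above:

# In[ ]:

x_dev = x - x.mean()  # Deviations from the sample mean
beta_direct = np.sum(x_dev * (y - y.mean())) / np.sum(x_dev**2)
beta_direct  # Should match beta_hat from linregress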
# ### Roots and fixed points
#
# Let's choose an arbitrary function to work with

# In[308]:

fig, ax = plt.subplots()

def f(x):
    return np.sin(4 * (x - 0.25)) + x + x**20 - 1

x = np.linspace(0, 1, 100)
ax.plot(x, f(x))
ax.plot(x, 0 * x)

# In[309]:

from scipy.optimize import bisect  # Bisection algorithm --- slow but robust

bisect(f, 0, 1)

# In[310]:

from scipy.optimize import newton  # Newton's method --- fast but less robust

newton(f, 0.2)  # Start the search at initial condition x = 0.2

# In[311]:

newton(f, 0.7)  # Start the search at x = 0.7 instead

# Here we see that the algorithm gets it wrong --- ``newton`` is fast but not robust
#
# Let's try a hybrid method

# In[312]:

from scipy.optimize import brentq

brentq(f, 0, 1)  # Hybrid method

# In[313]:

get_ipython().run_line_magic('timeit', 'bisect(f, 0, 1)')

# In[314]:

get_ipython().run_line_magic('timeit', 'newton(f, 0.2)')

# In[315]:

get_ipython().run_line_magic('timeit', 'brentq(f, 0, 1)')

# Note that the hybrid method is robust but still quite fast...

# ### Numerical optimization and integration

# In[316]:

from scipy.optimize import fminbound

fminbound(lambda x: x**2, -1, 2)  # Search in [-1, 2]

# In[317]:

from scipy.integrate import quad

integral, error = quad(lambda x: x**2, 0, 1)
integral

# ### Linear Algebra
#
# Let's look at some of the most common routines from linear and matrix algebra

# In[318]:

import scipy.linalg as la

# We'll experiment with matrices
#
# $$
# A = \begin{bmatrix} 2 & -1 \\ 3 & 0 \end{bmatrix}
# \quad \text{and} \quad
# b = \begin{bmatrix} 1 \\ 1 \end{bmatrix}
# $$

# In[319]:

A = [[2, -1], [3, 0]]
A = np.array(A)      # Convert from list to NumPy array
b = np.ones((2, 1))  # Shape is 2 x 1

# In[320]:

A

# In[321]:

b

# In[322]:

x = la.solve(A, b)  # Solve for x in Ax = b
print(x)

# Let's check that $Ax = b$

# In[323]:

np.dot(A, x)

# We can also invert directly

# In[324]:

la.inv(A)

# In[325]:

np.dot(A, la.inv(A))  # Should be the identity

# Let's compute the eigenvalues and eigenvectors

# In[326]:

eigvals, eigvecs = la.eig(A)

# In[327]:

print("eigenvalues = {}".format(eigvals))

# In[328]:

print("first eigenvector = {}".format(eigvecs[:, 0]))
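# As a quick sanity check, each eigenpair should satisfy $A v = \lambda v$. Here's a sketch of that test for the first pair (the eigenvalues of this particular $A$ are complex):

# In[ ]:

np.allclose(np.dot(A, eigvecs[:, 0]), eigvals[0] * eigvecs[:, 0])  # Should be True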
# ### More information
#
# * linear algebra: http://docs.scipy.org/doc/scipy/reference/linalg.html
# * numerical integration: http://docs.scipy.org/doc/scipy/reference/integrate.html
# * interpolation: http://docs.scipy.org/doc/scipy/reference/interpolate.html
# * optimization: http://docs.scipy.org/doc/scipy/reference/optimize.html
# * distributions and random number generation: http://docs.scipy.org/doc/scipy/reference/stats.html
# * signal processing: http://docs.scipy.org/doc/scipy/reference/signal.html

# ## Pandas
#
# Pandas is a very popular library for working with data sets. In pandas, data is held in a dataframe, which is kind of like a spreadsheet

# In[329]:

import pandas as pd

# Let's start by writing a test data set to the present working directory, so we can read it back in as a dataframe using pandas. We use an IPython magic to write the data from a cell to a file:

# In[330]:

get_ipython().run_cell_magic('file', 'test_data.csv', '"country","country isocode","year","POP","XRAT","tcgdp","cc","cg"\n"Argentina","ARG","2000","37335.653","0.9995","295072.21869","75.716805379","5.5788042896"\n"Australia","AUS","2000","19053.186","1.72483","541804.6521","67.759025993","6.7200975332"\n"India","IND","2000","1006300.297","44.9416","1728144.3748","64.575551328","14.072205773"\n"Israel","ISR","2000","6114.57","4.07733","129253.89423","64.436450847","10.266688415"\n"Malawi","MWI","2000","11801.505","59.543808333","5026.2217836","74.707624181","11.658954494"\n"South Africa","ZAF","2000","45064.098","6.93983","227242.36949","72.718710427","5.7265463933"\n"United States","USA","2000","282171.957","1","9898700","72.347054303","6.0324539789"\n"Uruguay","URY","2000","3219.793","12.099591667","25255.961693","78.978740282","5.108067988"\n')

# In[331]:

get_ipython().run_line_magic('ls', "./*.csv # Check it's there")

# In[332]:

df = pd.read_csv('./test_data.csv')

# In[333]:

df

# Let's try that again, but this time using the country as the index column

# In[334]:

df = pd.read_csv('./test_data.csv', index_col='country')

# In[335]:

df

# Let's drop the year since it's not very informative

# In[336]:

df.drop(['year'], axis=1, inplace=True)
df

# Let's add a column for GDP per capita

# In[337]:

df['GDP percap'] = df['tcgdp'] / df['POP']

# In[338]:

df

# Let's sort the whole data frame by GDP per capita

# In[339]:

df.sort_values(by='GDP percap', inplace=True)

# In[340]:

df

# Now we'll plot per capita GDP using the dataframe's plot method

# In[341]:

df['GDP percap'].plot(kind='bar')
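# Dataframes also support label-based row selection and boolean filtering. A couple of quick sketches using the dataframe built above (POP appears to be measured in thousands):

# In[ ]:

df.loc['India']  # Select the row indexed by 'India'

# In[ ]:

df[df['POP'] > 100000]  # Rows where POP exceeds 100000, i.e., populations above 100 million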
# ## Exercises
#
# Here are two exercises. Feel free to consult documentation such as can be found [here](http://docs.scipy.org/doc/scipy/reference/). The solutions are below. The cell with "solution below" is meant to push them below your line of sight and save you from temptation.

# #### Exercise 1
#
# Generate 10000 data points from the exponential distribution with density
#
# $$
# f(x; \alpha) = \alpha \exp(-\alpha x)
# \qquad
# (x > 0, \alpha > 0)
# $$
#
# using `scipy.stats` and taking $\alpha = 0.5$. Then, after looking up the maximum likelihood estimator of $\alpha$, compute the estimate given your data and check that it is in fact close to $\alpha$.

# In[342]:

# Put your solution here

# #### Exercise 2
#
# Using the same data set, implement maximum likelihood again, but this time pretending that you don't know the analytical expression for the maximum likelihood estimator. Set up the log likelihood function and maximize it numerically using a routine from `scipy.optimize`.

# In[343]:

# Put your solution here

# ## Solutions

# In[344]:

# Print some nonsense to partially hide solutions
filler_text = "solution below\n" * 25
print(filler_text)

# #### Solution to Exercise 1
#
# After checking [the docs for the exponential distribution](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html) we proceed as follows

# In[345]:

from scipy.stats import expon

alpha = 0.5
n = 10000
ep = expon(scale=1.0/alpha)  # The scale parameter is the reciprocal of alpha
x = ep.rvs(n)                # Generate n observations

# Let's check that we've got the right distribution here

# In[346]:

fig, ax = plt.subplots(figsize=(8, 5))
xmin, xmax = 0.001, 10.0
ax.set_xlim(xmin, xmax)
ax.hist(x, density=True, bins=40, alpha=0.3)  # density=True replaces the deprecated normed=True
grid = np.linspace(xmin, xmax, 200)
ax.plot(grid, ep.pdf(grid), 'g-', lw=2, label='true density')
ax.legend()

# It's [well-known](http://en.wikipedia.org/wiki/Exponential_distribution) that the MLE of $\alpha$ is $1/\bar x$, where $\bar x$ is the mean of the sample. Let's check that it is indeed close to $\alpha$.

# In[347]:

alpha_mle = 1.0 / x.mean()
print("max likelihood estimate of alpha is {}".format(alpha_mle))

# #### Solution to Exercise 2
#
# For a sample $x_1, \ldots, x_n$, the log likelihood is $\ell(a) = n \log a - a \sum_i x_i$, so we minimize its negative

# In[348]:

s = x.sum()

def neg_loglike(a):
    "Minus the log likelihood function for the exponential distribution"
    return -n * np.log(a) + a * s

# Minimize over a reasonable parameter space

# In[349]:

from scipy.optimize import fminbound

fminbound(neg_loglike, 0.01, 10.0)

# This is very close to the analytical value of the maximum likelihood estimator that we got in Exercise 1
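# For comparison, here's a sketch of the same minimization using ``scipy.optimize.minimize_scalar``, which offers several univariate methods behind a single interface:

# In[ ]:

from scipy.optimize import minimize_scalar

res = minimize_scalar(neg_loglike, bounds=(0.01, 10.0), method='bounded')
res.x  # Should again be close to 1 / x.mean()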