Statistical Computing Languages
Python
Statistical Computing Languages, RSS
# rerun Fernando's script to load in css
%run talktools
Python has the style of a scripting language (cf perl, bash).
# Open my web page and look for links (modified by code on Guido's blog)
import re
import urllib.request

regex = re.compile(r'href="([^"]+\.html)"')

def matcher(url, max=10):
    "Return the first several URL references found at a given url."
    data = urllib.request.urlopen(url).read().decode('utf-8', errors='ignore')
    hits = regex.findall(data)
    return hits[:max]

hits = matcher("http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/index.html")
for hit in hits:
    print(hit)
Python is also object oriented (cf S, MATLAB, java).
import pandas as pd
import numpy as np
class my_data(pd.DataFrame):
    def pronounce(self):
        print("Extending existing objects is easy, this encourages code reuse and good programming practice.")

a = my_data(np.zeros((3, 3)))
a.pronounce()
a.describe()
Several libraries provide the kind of functionality that might otherwise be sought in MATLAB:

pandas: data analysis in Python. Proving very popular for data analysis.
scipy stack: captures a range of scientific computing facilities (FFT, linear algebra etc.); a short sketch follows below.
scikit-learn: encodes models from a machine learning perspective.
statsmodels: encodes models from a statistics perspective.

What about MATLAB, R, Julia...? We have been moving our MATLAB code base to Python, and licensing is part of the attraction (cf R and octave, which are GPL licensed).
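As a small illustration of the scipy stack mentioned in the list above (the signal and matrix here are made up for illustration, not part of the original notebook):
import numpy as np
from scipy import fftpack, linalg

t = np.linspace(0, 1, 256, endpoint=False)              # one second sampled at 256 Hz
x = np.sin(2*np.pi*5*t) + 0.5*np.sin(2*np.pi*20*t)      # two sinusoids, 5 Hz and 20 Hz
X_freq = fftpack.fft(x)                                 # discrete Fourier transform
print(np.abs(X_freq[:128]).argmax())                    # dominant frequency bin: 5

A = np.array([[3., 1.], [1., 2.]])                      # a small linear system
b = np.array([9., 8.])
print(linalg.solve(A, b))                               # solves A x = b, giving [2., 3.]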
We use pandas to load in the data.
import pandas as pd # first we import the pandas library
df = pd.read_csv('../R/data/regression.csv') # now we read in the data
# gender is stored as male/female. We convert it to 0/1
df['Gender'] = df['Sex'].map( {'Female': 0, 'Male': 1} ).astype(int)
df.describe()
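As a quick aside (not in the original), the first few rows can also be inspected directly:
df.head()  # show the first five rows of the loaded data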
We can plot the data using the matplotlib library.
# import a plotting library: matplotlib
import matplotlib.pyplot as plt
# ask for plots to appear in the notebook
%matplotlib inline
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(df.Age, df.OI)
ax.set_xlabel('Age')
ax.set_ylabel('OI')
pandas allows for removal of erroneous entries by indexing according to 'truth values'.
df = df[df['Age']>0]
fig, ax = plt.subplots(figsize=(10, 7))
for symbol, sex in zip(['bo', 'ro'], ['Male', 'Female']):
    lines = ax.semilogy(df.Age[df.Sex == sex], df.OI[df.Sex == sex], symbol, linewidth=3, markersize=10)
ax.set_xlabel('Age').set_fontsize(18)
ax.set_ylabel('OI').set_fontsize(18)
ax.set_title('Plot of age against OI on a log scale').set_fontsize(20)
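The two colours are not labelled; a legend could be added at the end of the plotting cell above (a small aside, not in the original):
ax.legend(['Male', 'Female'])  # labels follow the order the groups were plotted in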
We can fit the regression model with statsmodels.
import statsmodels.api as sm

df['Eins'] = 1  # add a constant column for the intercept
Y = np.log(df.OI)
Y.name = 'np.log(OI)'
X = df[['Age', 'Gender', 'Eins']]
result = sm.OLS(Y, X).fit()
result.summary()
#import statsmodels.graphics.regressionplots as rp
#rp.plot_fit(result, 1)
statsmodels seems designed for those with experience of statistics and R.
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Fit regression model (using the natural log for the dependent variable)
results = smf.ols('np.log(OI) ~ Age + Sex', data=df).fit()
# Inspect the results
results.summary()
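Beyond the summary table, the fitted results object exposes the estimates directly; a small sketch using the results object from the cell above:
print(results.params)       # estimated coefficients
print(results.conf_int())   # confidence intervals for the coefficients
print(results.rsquared)     # coefficient of determination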
statsmodels can also load in R datasets.
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
# Inspect the results
results.summary()
For those coming from machine learning, scikit-learn would be more familiar.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
X, y = df[['Age', 'Gender']], np.log(df['OI']).values[:, None]
model.fit(X, y)
score = '{0:.3f}'.format(model.score(X, y))
print(score)
print(model.coef_)
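Note (an aside, not from the original): LinearRegression fits an intercept by default, which can be inspected.
print(model.intercept_)  # intercept fitted by scikit-learn; the direct solve below omits a constant column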
We can also compute a least squares fit directly with linear algebra, via the normal equations.
import numpy as np
beta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
beta = pd.DataFrame(beta, index=X.columns)
beta
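For completeness (this step is implicit above): the solve computes the least squares estimate, since minimising $\|\mathbf{y} - \mathbf{X}\boldsymbol{\beta}\|^2$ and setting the gradient to zero gives the normal equations
$$\mathbf{X}^\top \mathbf{X} \boldsymbol{\beta} = \mathbf{X}^\top \mathbf{y},$$
which is exactly the system passed to np.linalg.solve above.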
As Darren Wilkinson pointed out to me at the meeting, this is more correctly done with a QR decomposition (see this page for a description).
$$\mathbf{X}^\top \mathbf{X} \boldsymbol{\beta} = \mathbf{X}^\top \mathbf{y}$$
$$(\mathbf{Q}\mathbf{R})^\top (\mathbf{Q}\mathbf{R})\boldsymbol{\beta} = (\mathbf{Q}\mathbf{R})^\top \mathbf{y}$$
$$\mathbf{R}^\top (\mathbf{Q}^\top \mathbf{Q}) \mathbf{R} \boldsymbol{\beta} = \mathbf{R}^\top \mathbf{Q}^\top \mathbf{y}$$
$$\mathbf{R}^\top \mathbf{R} \boldsymbol{\beta} = \mathbf{R}^\top \mathbf{Q}^\top \mathbf{y}$$
$$\mathbf{R} \boldsymbol{\beta} = \mathbf{Q}^\top \mathbf{y}$$
So if we let $\mathbf{z} = \mathbf{Q}^\top \mathbf{y}$,
$$\mathbf{R} \boldsymbol{\beta} = \mathbf{z}$$
import numpy as np
import scipy as sp
Q, R = np.linalg.qr(X)  # thin QR decomposition of the design matrix
beta = sp.linalg.solve_triangular(R, np.dot(Q.T, y))  # back-substitution for R beta = Q^T y
beta = pd.DataFrame(beta, index=X.columns)
beta
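As a quick sanity check (not in the original), the triangular solve should agree with the normal equations up to numerical error:
beta_normal = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))  # normal-equation solution for comparison
print(np.allclose(beta.values, beta_normal))                   # should print True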