## Statistics, Data Mining, and Machine Learning in Astronomy¶

I share here a number of the free code examples that I think just scratch the surface of the really good content of the book.

## http://www.astroml.org/¶

Free, open source code to reproduce every example in the book.

## https://github.com/astroML/astroML¶

In [ ]:
from IPython.display import Image

import matplotlib.pyplot as plt

# The following line is an ipython notebook trick
%matplotlib inline

plt.rcParams['figure.figsize'] = 12, 8  # plotsize

In [78]:
Image(filename='text_cover1.png')

Out[78]:

## Bayesian Blocks for Histograms¶

Bayesian Blocks is a dynamic histogramming method which optimizes one of several possible fitness functions to determine an optimal binning for data, where the bins are not necessarily uniform width.

The code below uses a fitness function suitable for event data with possible repeats.

In [30]:
# Author: Jake VanderPlas <[email protected]/* <![CDATA[ */!function(t,e,r,n,c,a,p){try{t=document.currentScript||function(){for(t=document.getElementsByTagName('script'),e=t.length;e--;)if(t[e].getAttribute('data-cfhash'))return t[e]}();if(t&&(c=t.previousSibling)){p=t.parentNode;if(a=c.getAttribute('data-cfemail')){for(e='',r='0x'+a.substr(0,2)|0,n=2;a.length-n;n+=2)e+='%'+('0'+('0x'+a.substr(n,2)^r).toString(16)).slice(-2);p.replaceChild(document.createTextNode(decodeURIComponent(e)),c)}p.removeChild(t)}}catch(u){}}()/* ]]> */>
#   The figure is an example from astroML: see http://astroML.github.com
# Example has been modified slightly by Jonathan Whitmore
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt

from astroML.plotting import hist

In [60]:
# draw a set of variables
np.random.seed(0)
t = np.concatenate([stats.cauchy(-5, 1.8).rvs(500),
stats.cauchy(-4, 0.8).rvs(2000),
stats.cauchy(-1, 0.3).rvs(500),
stats.cauchy(2, 0.8).rvs(1000),
stats.cauchy(4, 1.5).rvs(500)])

# truncate values to a reasonable range
t = t[(t > -15) & (t < 15)]

4323

In [79]:
plt.rcParams['figure.figsize'] = 12, 8  # plotsize

xval = np.linspace(-15, 15, 2000)

jw = stats.cauchy(-5, 1.8).pdf(xval) * 500
jw += stats.cauchy(-4, 0.8).pdf(xval) * 2000
jw += stats.cauchy(-1, 0.3).pdf(xval) * 500
jw += stats.cauchy(2, 0.8).pdf(xval) * 1000
jw += stats.cauchy(4, 1.5).pdf(xval)* 500
jw = jw / 2000.0
plt.xlabel("t", fontsize=25)
plt.ylabel("Some PDF", fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.ylim(0, 0.5)
plt.plot(xval, jw)

Out[79]:
[<matplotlib.lines.Line2D at 0x12423df10>]
In [49]:
stats.cauchy?

In [59]:
plt.rcParams['figure.figsize'] = 12, 8  # plotsize

alpha = .9
# First figure: show normal histogram binning
fig = plt.figure(figsize=(10, 4))

ax1.hist(t, bins=15, histtype='stepfilled', alpha=alpha, normed=True)
ax1.set_xlabel('t')
ax1.set_ylabel('P(t)')

ax2.hist(t, bins=200, histtype='stepfilled', alpha=alpha, normed=True)
ax2.set_xlabel('t')
ax2.set_ylabel('P(t)')

Out[59]:
<matplotlib.text.Text at 0x1222ee4d0>
In [64]:
np.random.seed(0)
N = 10000
mu_gamma_f = [(5, 1.0, 0.1),
(7, 0.5, 0.5),
(9, 0.1, 0.1),
(12, 0.5, 0.2),
(14, 1.0, 0.1)]
true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x)
for (mu, gamma, f) in mu_gamma_f])

jw_modify_true_pdf = np.array([true_pdf(x) for x in t])
x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N))
for (mu, gamma, f) in mu_gamma_f])
np.random.shuffle(x)
x = x[x > -10]
x = x[x < 30]

In [77]:
fig = plt.figure(figsize=(10, 10))
N_values = (500, 5000)
subplots = (211, 212)

for N, subplot in zip(N_values, subplots):
xN = x[:N]
t = np.linspace(-10, 30, 1000)

# plot the results
ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')
#     hist(xN, bins='knuth', ax=ax, normed=True,
#          histtype='stepfilled', alpha=0.3,
#          label='Knuth Histogram')
hist(xN, bins='blocks', ax=ax, normed=True,
histtype='step', color='red', linewidth=2.0,
label="Bayesian Blocks")
ax.plot(t, true_pdf(t), '-', color='black',
label="Generating Distribution", linewidth=2.0, alpha=0.5)

# label the plot
ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
transform=ax.transAxes, fontsize=25)
ax.set_ylabel('$p(x)$', fontsize=25.0)
ax.legend(loc='upper right', prop=dict(size=15))

if subplot == 212:
ax.set_xlabel('$x$', fontsize=25.0)
#ax.set_xticklabels(fontsize=20)
ax.set_xlim(0, 20)
ax.set_ylim(-0.01, 0.4001)

plt.show()

An important feature of this method is that the bins are optimal in a quantitative sense, meaning that statistical significance can be attached to the bin configuration. -- p228


• Do not reinvent the wheel -- you often will spend more time doing it in a quantitatively worse way
• This code is free, open source, and available in astroml.

## Extreme Deconvolution of Stellar Data¶

In [11]:
# Author: Jake VanderPlas
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   To report a bug or issue, use the following forum:

import numpy as np
from matplotlib import pyplot as plt

from astroML.density_estimation import XDGMM
from astroML.crossmatch import crossmatch
from astroML.datasets import fetch_sdss_S82standards, fetch_imaging_sample
from astroML.plotting.tools import draw_ellipse
from astroML.decorators import pickle_results
from astroML.stats import sigmaG

In [12]:
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
#from astroML.plotting import setup_text_plots
#setup_text_plots(fontsize=8, usetex=False)

In [13]:
#------------------------------------------------------------
# define u-g-r-i-z extinction from Berry et al, arXiv 1111.4985
# multiply extinction by A_r
extinction_vector = np.array([1.810, 1.400, 1.0, 0.759, 0.561])

#----------------------------------------------------------------------
# Fetch and process the noisy imaging data
data_noisy = fetch_imaging_sample()

# select only stars
data_noisy = data_noisy[data_noisy['type'] == 6]

# Get the extinction-corrected magnitudes for each band
X = np.vstack([data_noisy[f + 'RawPSF'] for f in 'ugriz']).T
Xerr = np.vstack([data_noisy[f + 'psfErr'] for f in 'ugriz']).T

# extinction terms from Berry et al, arXiv 1111.4985
X -= (extinction_vector * data_noisy['rExtSFD'][:, None])

#----------------------------------------------------------------------
# Fetch and process the stacked imaging data
data_stacked = fetch_sdss_S82standards()

# cut to RA, DEC range of imaging sample
RA = data_stacked['RA']
DEC = data_stacked['DEC']
data_stacked = data_stacked[(RA > 0) & (RA < 10) &
(DEC > -1) & (DEC < 1)]

# get stacked magnitudes for each band
Y = np.vstack([data_stacked['mmu_' + f] for f in 'ugriz']).T
Yerr = np.vstack([data_stacked['msig_' + f] for f in 'ugriz']).T

# extinction terms from Berry et al, arXiv 1111.4985
Y -= (extinction_vector * data_stacked['A_r'][:, None])

# quality cuts
g = Y[:, 1]
mask = ((Yerr.max(1) < 0.05) &
(g < 20))

/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/astropy/logger.py:531: RuntimeWarning: log file u'/Users/jwhitmore/.astropy/config/astropy.log' could not be opened for writing: [Errno 13] Permission denied: u'/Users/jwhitmore/.astropy/config/astropy.log'
'{1}'.format(log_file_path, unicode(e)), RuntimeWarning)

In [14]:
#----------------------------------------------------------------------
# cross-match
#  the imaging sample contains both standard and variable stars.  We'll
#  perform a cross-match with the standard star catalog and choose objects
#  which are common to both.
Xlocs = np.hstack((data_noisy['ra'][:, np.newaxis],
data_noisy['dec'][:, np.newaxis]))
Ylocs = np.hstack((data_stacked['RA'][:, np.newaxis],
data_stacked['DEC'][:, np.newaxis]))

print "number of noisy points:  ", Xlocs.shape
print "number of stacked points:", Ylocs.shape

# find all points within 0.9 arcsec.  This cutoff was selected
# by plotting a histogram of the log(distances).
dist, ind = crossmatch(Xlocs, Ylocs, max_distance=0.9 / 3600.)

# select the data

# double-check that our cross-match succeeded
assert X.shape == Y.shape
print "size after crossmatch:", X.shape

number of noisy points:   (82003, 2)
number of stacked points: (13377, 2)
size after crossmatch: (12313, 5)

In [15]:
#----------------------------------------------------------------------
# perform extreme deconvolution on the noisy sample

# first define mixing matrix W
W = np.array([[0, 1, 0, 0, 0],    # g magnitude
[1, -1, 0, 0, 0],   # u-g color
[0, 1, -1, 0, 0],   # g-r color
[0, 0, 1, -1, 0],   # r-i color
[0, 0, 0, 1, -1]])  # i-z color

X = np.dot(X, W.T)
Y = np.dot(Y, W.T)

# compute error covariance from mixing matrix
Xcov = np.zeros(Xerr.shape + Xerr.shape[-1:])
Xcov[:, range(Xerr.shape[1]), range(Xerr.shape[1])] = Xerr ** 2

# each covariance C = WCW^T
# best way to do this is with a tensor dot-product
Xcov = np.tensordot(np.dot(Xcov, W.T), W, (-2, -1))

In [16]:
#----------------------------------------------------------------------
# This is a long calculation: save results to file
@pickle_results("XD_stellar.pkl")
def compute_XD(n_clusters=12, rseed=0, n_iter=100, verbose=True):
np.random.seed(rseed)
clf = XDGMM(n_clusters, n_iter=n_iter, tol=1E-5, verbose=verbose)
clf.fit(X, Xcov)
return clf

clf = compute_XD(12)

@pickle_results: using precomputed results from 'XD_stellar.pkl'

In [17]:
#------------------------------------------------------------
# Fit and sample from the underlying distribution
np.random.seed(42)
X_sample = clf.sample(X.shape[0])

In [18]:
#------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(10, 10))
bottom=0.1, top=0.95,
wspace=0.02, hspace=0.02)

# only plot 1/10 of the stars for clarity
ax1.scatter(Y[::10, 2], Y[::10, 3], s=9, lw=0, c='k')

ax2.scatter(X[::10, 2], X[::10, 3], s=9, lw=0, c='k')

ax3.scatter(X_sample[::10, 2], X_sample[::10, 3], s=9, lw=0, c='k')

for i in range(clf.n_components):
draw_ellipse(clf.mu[i, 2:4], clf.V[i, 2:4, 2:4], scales=[2],
ec='k', fc='gray', alpha=0.2, ax=ax4)

titles = ["Standard Stars", "Single Epoch",
"Extreme Deconvolution\n  resampling",
"Extreme Deconvolution\n  cluster locations"]
ax = [ax1, ax2, ax3, ax4]

for i in range(4):
ax[i].set_xlim(-0.6, 1.8)
ax[i].set_ylim(-0.6, 1.8)

ax[i].xaxis.set_major_locator(plt.MultipleLocator(0.5))
ax[i].yaxis.set_major_locator(plt.MultipleLocator(0.5))

ax[i].text(0.05, 0.95, titles[i],
ha='left', va='top', transform=ax[i].transAxes)

if i in (0, 1):
ax[i].xaxis.set_major_formatter(plt.NullFormatter())
else:
ax[i].set_xlabel('$g-r$')

if i in (1, 3):
ax[i].yaxis.set_major_formatter(plt.NullFormatter())
else:
ax[i].set_ylabel('$r-i$')

In [19]:
#------------------------------------------------------------
# Second figure: the width of the locus
fig = plt.figure(figsize=(10, 10))

labels = ['single epoch', 'standard stars', 'XD resampled']
linestyles = ['solid', 'dashed', 'dotted']
for data, label, ls in zip((X, Y, X_sample), labels, linestyles):
g = data[:, 0]
gr = data[:, 2]
ri = data[:, 3]

r = g - gr
i = r - ri

mask = (gr > 0.3) & (gr < 1.0)

w = -0.227 * g + 0.792 * r - 0.567 * i + 0.05

sigma = sigmaG(w)

ax.hist(w, bins=np.linspace(-0.08, 0.08, 100), linestyle=ls,
histtype='step', label=label + '\n\t' + r'$\sigma_G=%.3f$' % sigma,
normed=True)

ax.legend(loc=2)
ax.text(0.95, 0.95, '$w = -0.227g + 0.792r$\n$- 0.567i + 0.05$',
transform=ax.transAxes, ha='right', va='top')

ax.set_xlim(-0.07, 0.07)
ax.set_ylim(0, 55)

ax.set_xlabel('$w$')
ax.set_ylabel('$N(w)$')

Out[19]:
<matplotlib.text.Text at 0x10ca18450>

## K-Neighbors for Photometric Redshifts¶

In [20]:
# Author: Jake VanderPlas <[email protected]/* <![CDATA[ */!function(t,e,r,n,c,a,p){try{t=document.currentScript||function(){for(t=document.getElementsByTagName('script'),e=t.length;e--;)if(t[e].getAttribute('data-cfhash'))return t[e]}();if(t&&(c=t.previousSibling)){p=t.parentNode;if(a=c.getAttribute('data-cfemail')){for(e='',r='0x'+a.substr(0,2)|0,n=2;a.length-n;n+=2)e+='%'+('0'+('0x'+a.substr(n,2)^r).toString(16)).slice(-2);p.replaceChild(document.createTextNode(decodeURIComponent(e)),c)}p.removeChild(t)}}catch(u){}}()/* ]]> */>
#   The figure is an example from astroML: see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt

from sklearn.neighbors import KNeighborsRegressor

from astroML.datasets import fetch_sdss_galaxy_colors
from astroML.plotting import scatter_contour

In [21]:
n_neighbors = 1

data = fetch_sdss_galaxy_colors()

N = len(data)

# shuffle data
np.random.seed(0)
np.random.shuffle(data)

# put colors in a matrix
X = np.zeros((N, 4))
X[:, 0] = data['u'] - data['g']
X[:, 1] = data['g'] - data['r']
X[:, 2] = data['r'] - data['i']
X[:, 3] = data['i'] - data['z']
z = data['redshift']

# divide into training and testing data
Ntrain = N / 2
Xtrain = X[:Ntrain]
ztrain = z[:Ntrain]

Xtest = X[Ntrain:]
ztest = z[Ntrain:]

knn = KNeighborsRegressor(n_neighbors, weights='uniform')
zpred = knn.fit(Xtrain, ztrain).predict(Xtest)

axis_lim = np.array([-0.1, 2.5])

rms = np.sqrt(np.mean((ztest - zpred) ** 2))
print "RMS error = %.2g" % rms

RMS error = 0.24

In [22]:
ax = plt.axes()
plt.scatter(ztest, zpred, c='k', lw=0, s=4)
plt.plot(axis_lim, axis_lim, '--k')
plt.plot(axis_lim, axis_lim + rms, ':k')
plt.plot(axis_lim, axis_lim - rms, ':k')
plt.xlim(axis_lim)
plt.ylim(axis_lim)

plt.text(0.99, 0.02, "RMS error = %.2g" % rms,
ha='right', va='bottom', transform=ax.transAxes,
bbox=dict(ec='w', fc='w'), fontsize=16)

plt.title('Photo-z: Nearest Neigbor Regression', fontsize=20)
plt.xlabel(r'$\mathrm{z_{spec}}$', fontsize=24)
plt.ylabel(r'$\mathrm{z_{phot}}$', fontsize=24)
plt.show()


## Double Gaussian Model Test¶

In [23]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
from astroML.density_estimation import GaussianMixture1D

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=18, usetex=True)

#------------------------------------------------------------
# Generate the data
mu1_in = 0
sigma1_in = 0.3
mu2_in = 1
sigma2_in = 1
ratio_in = 1.5
N = 200

np.random.seed(10)
gm = GaussianMixture1D([mu1_in, mu2_in],
[sigma1_in, sigma2_in],
[ratio_in, 1])
x_sample = gm.sample(N)

#------------------------------------------------------------
# Get the MLE fit for a single gaussian
sample_mu = np.mean(x_sample)
sample_std = np.std(x_sample, ddof=1)

In [24]:
#------------------------------------------------------------
# Plot the sampled data
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(x_sample, 20, histtype='stepfilled', normed=True, fc='#CCCCCC')
x = np.linspace(-2.1, 4.1, 1000)

factor1 = ratio_in / (1. + ratio_in)
factor2 = 1. / (1. + ratio_in)

ax.plot(x, gm.pdf(x), '-k', label='true distribution')
ax.plot(x, gm.pdf_individual(x), ':k')

ax.plot(x, norm.pdf(x, sample_mu, sample_std), '--k', label='best fit normal')

ax.legend(loc=1)

ax.set_xlim(-2.1, 4.1)

ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')
ax.set_title('Input pdf and sampled data')
ax.text(0.95, 0.80, ('$\mu_1 = 0;\ \sigma_1=0.3$\n'
'$\mu_2=1;\ \sigma_2=1.0$\n'
'$\mathrm{ratio}=1.5$'),
transform=ax.transAxes, ha='right', va='top')
plt.show()

/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/utils/__init__.py:93: DeprecationWarning: Function eval is deprecated; GMM.eval was renamed to GMM.score_samples in 0.14 and will be removed in 0.16.
warnings.warn(msg, category=DeprecationWarning)
/Library/Frameworks/EPD64.framework/Versions/7.2/lib/python2.7/site-packages/sklearn/utils/__init__.py:93: DeprecationWarning: Function eval is deprecated; GMM.eval was renamed to GMM.score_samples in 0.14 and will be removed in 0.16.
warnings.warn(msg, category=DeprecationWarning)

In [ ]: