Tutorial Setup

Check your install

In [ ]:

import numpy

In [ ]:

import scipy

In [ ]:

import matplotlib

In [ ]:

import sklearn

In [ ]:

import psutil

In [ ]:

import pandas

In [ ]:

# NOTE(review): IPython 4.0 moved IPython.parallel into the separate
# `ipyparallel` package — confirm which IPython version this tutorial targets.
import IPython.parallel

Finding the location of an installed package and its version:

In [ ]:

numpy.__path__

In [ ]:

numpy.__version__

Check that you have the datasets

In [ ]:

# Execute the helper script located one directory above this notebook.
%run ../fetch_data.py
# Alternative invocation that passes explicit arguments (presumably the names
# of the datasets to fetch — verify against fetch_data.py):
# %run ../fetch_data.py twenty_newsgroups sentiment140 covertype

In [ ]:

import os

# os.listdir() returns entries in arbitrary, filesystem-dependent order;
# sort them so the listing is stable across runs and machines.
for fname in sorted(os.listdir('../datasets/')):
    print(fname)

A NumPy primer

NumPy array dtypes and shapes

In [ ]:

import numpy as np

In [ ]:

# One-dimensional array built from a list of three Python integers.
a = np.array([1, 2, 3])

In [ ]:

# Two-dimensional array built from a nested list: 2 rows, 3 columns.
b = np.array([[0, 2, 4], [1, 3, 5]])

In [ ]:

b.shape

In [ ]:

b.dtype

In [ ]:

a.shape

In [ ]:

a.dtype

In [ ]:

# Length-5 array of zeros; no dtype is given, so NumPy's default float type is used.
np.zeros(5)

In [ ]:

# 3x4 array of ones with an explicitly requested 32-bit integer dtype.
np.ones(shape=(3, 4), dtype=np.int32)

Common array operations

In [ ]:

# Element-wise scaling: every entry of b is multiplied by 0.5.
# b holds integers, so the float factor makes c a floating-point array.
c = b * 0.5

In [ ]:

c.shape

In [ ]:

c.dtype

In [ ]:

# Broadcasting: a has 3 elements and c has 3 columns, so a is added to
# each row of c and d ends up with the same 2x3 shape as c.
d = a + c

In [ ]:

# Indexing with a single integer selects the first row of d.
d[0]

In [ ]:

# One index per axis selects a single element: row 0, column 0.
d[0, 0]

In [ ]:

# A full slice on the first axis keeps all rows: this is the first column of d.
d[:, 0]

In [ ]:

d.sum()

In [ ]:

d.mean()

In [ ]:

# axis=0 reduces along the rows, giving one sum per column (3 values).
d.sum(axis=0)

In [ ]:

# axis=1 reduces along the columns, giving one mean per row (2 values).
d.mean(axis=1)

Reshaping and inplace update

In [ ]:

# The integers 0 through 11 in a one-dimensional array.
e = np.arange(12)

In [ ]:

# Same 12 values arranged as 3 rows of 4; for a contiguous array like e,
# reshape returns a view on e's data rather than a copy.
f = e.reshape(3, 4)

In [ ]:

# In-place update: zero every element of e from index 5 onward.
# NOTE: f was created by reshaping e, so it shares e's buffer and shows
# these zeros too.
e[5:] = 0

In [ ]:

Combining arrays

In [ ]:

# Join three copies of a end to end into one 1-D array of 9 elements.
np.concatenate([a, a, a])

In [ ]:

# Stack along the first axis (rows): a is treated as a single row, so the
# result has 1 + 2 + 2 = 5 rows and 3 columns.
np.vstack([a, b, d])

In [ ]:

# Stack side by side (columns): b and d both have 2 rows, giving a 2x6 result.
np.hstack([b, d])

A Matplotlib primer

In [ ]:

# Render figures inline in the notebook output rather than in separate windows.
%matplotlib inline

In [ ]:

import matplotlib.pyplot as plt

In [ ]:

# 10 evenly spaced values from 0 to 2, both endpoints included.
x = np.linspace(0, 2, 10)

In [ ]:

# With only one data argument, the values of x are plotted against their
# index (0..9). 'o-' draws circle markers joined by a solid line.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.plot(x, 'o-');

In [ ]:

# Draw two curves on the same axes; each label feeds the legend below.
plt.plot(x, x, 'o-', label='linear')
plt.plot(x, x ** 2, 'x-', label='quadratic')

# Let Matplotlib pick the legend position that overlaps the data least.
plt.legend(loc='best')
plt.title('Linear vs Quadratic progression')
plt.xlabel('Input')
# Trailing semicolon suppresses the echoed return value in the notebook.
plt.ylabel('Output');

In [ ]:

# Draw 1000 samples from a normal distribution with mean 1.0 and
# standard deviation 0.5. No seed is set, so values differ on every run.
samples = np.random.normal(loc=1.0, scale=0.5, size=1000)

In [ ]:

samples.shape

In [ ]:

samples.dtype

In [ ]:

samples[:30]

In [ ]:

# Histogram of the samples using 50 equal-width bins; the trailing
# semicolon hides the returned counts and bin edges.
plt.hist(samples, bins=50);

In [ ]:

# Two separate sets of 10000 draws to compare: a normal distribution
# (mean 1, standard deviation 0.5) and a Student's t distribution with
# 10 degrees of freedom.
samples_1 = np.random.normal(loc=1, scale=.5, size=10000)
samples_2 = np.random.standard_t(df=10, size=10000)

In [ ]:

# Shared bin edges (50 points from -3 to 3, i.e. 49 bins) so both histograms
# are directly comparable; samples outside [-3, 3] are left out of the plot.
bins = np.linspace(-3, 3, 50)
# Assigning to _ discards the return value; alpha=0.5 keeps overlapping
# bars of the two histograms visible.
_ = plt.hist(samples_1, bins=bins, alpha=0.5, label='samples 1')
_ = plt.hist(samples_2, bins=bins, alpha=0.5, label='samples 2')
plt.legend(loc='upper left');

In [ ]:

# Pair the i-th draw of each sample as an (x, y) point; the low alpha makes
# dense regions show up darker than sparse ones.
plt.scatter(samples_1, samples_2, alpha=0.1);

In [ ]: