Chapter 2: Basic Operations

Vectors, Data, Matrices and Subsetting

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
x = np.array([2,7,5])    # explicit vector creation
x
Out[2]:
array([2, 7, 5])
In [3]:
y = np.arange(4, 13, 3)  # vector creation from a sequence (start, stop, step)
y
Out[3]:
array([ 4,  7, 10])
In [4]:
x + y    # vectors can be added
Out[4]:
array([ 6, 14, 15])
In [5]:
x / y    # divided
Out[5]:
array([ 0.5,  1. ,  0.5])
In [6]:
x ** y    # exponentiated
Out[6]:
array([     16,  823543, 9765625])
In [7]:
x[1]    # vector elements can be selected by position
Out[7]:
7
In [8]:
x[1:3]  # multiple elements can be selected using slices
Out[8]:
array([7, 5])
In [9]:
x[-2]  # elements can be specified as offset from end
Out[9]:
7
In [10]:
x[np.array([0,1])]  # elements can be specified as an array
Out[10]:
array([2, 7])
In [11]:
Z = np.matrix(np.arange(1,13)).reshape((4, 3))
Z                 # note: R arranges the elements column-wise
Out[11]:
matrix([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])
In [12]:
Z[2:4, 1:3]    # R is 1-based and includes ending index, Python is 0 based and does not.
Out[12]:
matrix([[ 8,  9],
        [11, 12]])
In [13]:
Z[:, 1:3]    # column slice
Out[13]:
matrix([[ 2,  3],
        [ 5,  6],
        [ 8,  9],
        [11, 12]])
In [14]:
Z.shape
Out[14]:
(4, 3)

Generating Random Data, Graphics

In [15]:
x = np.random.uniform(0.0, 1.0, 50)
x
Out[15]:
array([ 0.65268288,  0.55438063,  0.12332376,  0.73656161,  0.172588  ,
        0.6952661 ,  0.36758793,  0.42034067,  0.54835434,  0.85493252,
        0.36666796,  0.14029571,  0.16910043,  0.18956811,  0.77016189,
        0.1914512 ,  0.48202498,  0.98936099,  0.43339457,  0.3690124 ,
        0.91976653,  0.42354589,  0.23428098,  0.24846295,  0.20409963,
        0.71024602,  0.57361799,  0.12510961,  0.70550836,  0.62120022,
        0.7089459 ,  0.67670746,  0.39836928,  0.17109468,  0.57367169,
        0.60584339,  0.72445482,  0.67991423,  0.94658844,  0.91068048,
        0.65489981,  0.05985463,  0.56797862,  0.67604148,  0.989085  ,
        0.79619792,  0.46406902,  0.91227664,  0.38043679,  0.04246386])
In [16]:
y = np.random.normal(0.0, 1.0, 50)
y
Out[16]:
array([ -5.95648785e-01,   1.54445571e+00,   1.35312925e+00,
         1.88123098e+00,  -3.80353324e-01,  -9.67551972e-04,
         5.97422145e-01,   3.76114342e-02,  -1.33779968e+00,
        -5.95407697e-01,   1.27206236e+00,  -1.70114410e+00,
        -1.84147571e+00,   7.16611666e-01,   2.33006825e+00,
        -6.56709283e-01,   8.26951067e-01,   4.64854930e-01,
         8.79290232e-01,  -7.42861574e-02,   4.88797828e-01,
         1.50860241e+00,   1.07467313e+00,  -7.99843607e-01,
         6.81719603e-02,   5.06424430e-01,   2.15770333e-01,
         1.67843398e+00,  -5.01086886e-01,   1.51407152e-01,
        -3.30648417e-02,  -8.50683506e-01,  -1.56531920e+00,
        -6.43438018e-01,  -9.55514605e-01,   1.51696803e-01,
        -9.87417666e-01,  -4.44820480e-01,   1.21902188e-01,
        -5.20474443e-01,   8.31544982e-01,  -6.75527626e-01,
         1.00537660e+00,   4.87948062e-01,   3.43123002e-01,
         3.95408496e-01,   4.27046930e-01,   3.69945068e-01,
         1.36728064e+00,  -3.15495180e-01])
In [17]:
fig, ax = plt.subplots()
plt.scatter(x, y)
Out[17]:
<matplotlib.collections.PathCollection at 0x420b410>
In [18]:
fig, ax = plt.subplots()
plt.xlabel("Random Uniform")
plt.ylabel("Random Normal")
plt.scatter(x, y, marker='o', color='red')  # plot customizations
Out[18]:
<matplotlib.collections.PathCollection at 0x4241f50>
In [19]:
plt.subplot(121)    # parameter indicates 1 rows, 2 col, first figure
plt.scatter(x, y)
plt.subplot(122)
plt.hist(y)
Out[19]:
(array([ 3.,  1.,  9.,  5.,  9.,  9.,  6.,  3.,  4.,  1.]),
 array([-1.84147571, -1.42432131, -1.00716692, -0.59001252, -0.17285812,
        0.24429627,  0.66145067,  1.07860506,  1.49575946,  1.91291385,
        2.33006825]),
 <a list of 10 Patch objects>)

Reading in data

In [20]:
# data comes from ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
auto_df = pd.read_csv("../data/Auto.csv")
auto_df.columns     # column names
Out[20]:
Index([u'mpg', u'cylinders', u'displacement', u'horsepower', u'weight', u'acceleration', u'year', u'origin', u'name'], dtype='object')
In [21]:
auto_df.shape    # number of rows, number of columns
Out[21]:
(397, 9)
In [22]:
type(auto_df)
Out[22]:
pandas.core.frame.DataFrame
In [23]:
auto_df.describe()  # equivalent of R's DataFrame.summary()
Out[23]:
mpg cylinders displacement weight acceleration year origin
count 397.000000 397.000000 397.000000 397.000000 397.000000 397.000000 397.000000
mean 23.515869 5.458438 193.532746 2970.261965 15.555668 75.994962 1.574307
std 7.825804 1.701577 104.379583 847.904119 2.749995 3.690005 0.802549
min 9.000000 3.000000 68.000000 1613.000000 8.000000 70.000000 1.000000
25% 17.500000 4.000000 104.000000 2223.000000 13.800000 73.000000 1.000000
50% 23.000000 4.000000 146.000000 2800.000000 15.500000 76.000000 1.000000
75% 29.000000 8.000000 262.000000 3609.000000 17.100000 79.000000 2.000000
max 46.600000 8.000000 455.000000 5140.000000 24.800000 82.000000 3.000000

8 rows × 7 columns

In [24]:
plt.ylabel("MPG")
auto_df.plot(x="cylinders", y="mpg", style='o')
Out[24]:
<matplotlib.axes.AxesSubplot at 0x46ef650>
In [25]:
auto_df.boxplot(column="mpg", by="cylinders") # MPG distribution by number of cylinders
Out[25]:
<matplotlib.axes.AxesSubplot at 0x499f510>
In [26]:
# similar to R pairs, shows correlation scatter plots between columns and distribution for each 
# column along the diagonal.
# The R version that uses formulas does not seem to have a Python equivalent (and doesn't seem
# to be very useful for exploratory analysis IMO).
axes = pd.tools.plotting.scatter_matrix(auto_df, color="brown")