from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Vectors: creation, element-wise arithmetic and indexing with 1-D NumPy arrays.
# The line following each expression is the output recorded when the cell was run.
x = np.array([2,7,5]) # explicit vector creation
x
array([2, 7, 5])
y = np.arange(4, 13, 3) # vector creation from a sequence (start, stop, step); stop itself is excluded
y
array([ 4, 7, 10])
x + y # vectors can be added element-wise
array([ 6, 14, 15])
x / y # element-wise true division (float result, see the __future__ import)
array([ 0.5, 1. , 0.5])
x ** y # element-wise exponentiation: x[i] raised to the power y[i]
array([ 16, 823543, 9765625])
x[1] # vector elements can be selected by position (0-based)
7
x[1:3] # multiple elements can be selected using slices; the end index is excluded
array([7, 5])
x[-2] # negative positions count back from the end
7
x[np.array([0,1])] # an array of positions selects several elements at once
array([2, 7])
# Build a 4x3 array holding 1..12. A plain 2-D ndarray is used instead of
# np.matrix, which NumPy no longer recommends; the slicing shown below
# behaves the same for both.
Z = np.arange(1, 13).reshape((4, 3))
Z # note: NumPy fills the rows first, whereas R arranges the elements column-wise
array([[ 1, 2, 3], [ 4, 5, 6], [ 7, 8, 9], [10, 11, 12]])
Z[2:4, 1:3] # R is 1-based and includes ending index, Python is 0 based and does not.
array([[ 8, 9], [11, 12]])
Z[:, 1:3] # column slice: every row, columns 1 and 2
array([[ 2, 3], [ 5, 6], [ 8, 9], [11, 12]])
Z.shape # (rows, columns)
(4, 3)
# 50 random draws from a uniform distribution between 0.0 and 1.0
x = np.random.uniform(low=0.0, high=1.0, size=50)
x
array([ 0.65268288, 0.55438063, 0.12332376, 0.73656161, 0.172588 , 0.6952661 , 0.36758793, 0.42034067, 0.54835434, 0.85493252, 0.36666796, 0.14029571, 0.16910043, 0.18956811, 0.77016189, 0.1914512 , 0.48202498, 0.98936099, 0.43339457, 0.3690124 , 0.91976653, 0.42354589, 0.23428098, 0.24846295, 0.20409963, 0.71024602, 0.57361799, 0.12510961, 0.70550836, 0.62120022, 0.7089459 , 0.67670746, 0.39836928, 0.17109468, 0.57367169, 0.60584339, 0.72445482, 0.67991423, 0.94658844, 0.91068048, 0.65489981, 0.05985463, 0.56797862, 0.67604148, 0.989085 , 0.79619792, 0.46406902, 0.91227664, 0.38043679, 0.04246386])
# 50 random draws from a normal distribution with mean 0.0 and standard deviation 1.0
y = np.random.normal(loc=0.0, scale=1.0, size=50)
y
array([ -5.95648785e-01, 1.54445571e+00, 1.35312925e+00, 1.88123098e+00, -3.80353324e-01, -9.67551972e-04, 5.97422145e-01, 3.76114342e-02, -1.33779968e+00, -5.95407697e-01, 1.27206236e+00, -1.70114410e+00, -1.84147571e+00, 7.16611666e-01, 2.33006825e+00, -6.56709283e-01, 8.26951067e-01, 4.64854930e-01, 8.79290232e-01, -7.42861574e-02, 4.88797828e-01, 1.50860241e+00, 1.07467313e+00, -7.99843607e-01, 6.81719603e-02, 5.06424430e-01, 2.15770333e-01, 1.67843398e+00, -5.01086886e-01, 1.51407152e-01, -3.30648417e-02, -8.50683506e-01, -1.56531920e+00, -6.43438018e-01, -9.55514605e-01, 1.51696803e-01, -9.87417666e-01, -4.44820480e-01, 1.21902188e-01, -5.20474443e-01, 8.31544982e-01, -6.75527626e-01, 1.00537660e+00, 4.87948062e-01, 3.43123002e-01, 3.95408496e-01, 4.27046930e-01, 3.69945068e-01, 1.36728064e+00, -3.15495180e-01])
# Basic scatter plot of y against x, drawn on the Axes created by subplots()
fig, ax = plt.subplots()
ax.scatter(x, y)
<matplotlib.collections.PathCollection at 0x420b410>
# Same scatter plot with axis labels and a custom marker and colour
fig, ax = plt.subplots()
ax.set_xlabel("Random Uniform")
ax.set_ylabel("Random Normal")
ax.scatter(x, y, marker='o', color='red') # plot customizations
<matplotlib.collections.PathCollection at 0x4241f50>
# Two panels side by side on a grid of 1 row and 2 columns
plt.subplot(1, 2, 1) # first panel: scatter plot
plt.scatter(x, y)
plt.subplot(1, 2, 2) # second panel: histogram of y
plt.hist(y)
(array([ 3., 1., 9., 5., 9., 9., 6., 3., 4., 1.]), array([-1.84147571, -1.42432131, -1.00716692, -0.59001252, -0.17285812, 0.24429627, 0.66145067, 1.07860506, 1.49575946, 1.91291385, 2.33006825]), <a list of 10 Patch objects>)
# Load the Auto data set into a DataFrame and inspect its basic structure.
# data comes from ISL book site: http://www-bcf.usc.edu/~gareth/ISL/data.html
auto_df = pd.read_csv("../data/Auto.csv") # relative path: depends on the current working directory
auto_df.columns # column names
Index([u'mpg', u'cylinders', u'displacement', u'horsepower', u'weight', u'acceleration', u'year', u'origin', u'name'], dtype='object')
auto_df.shape # number of rows, number of columns
(397, 9)
type(auto_df)
pandas.core.frame.DataFrame
# NOTE(review): 'horsepower' appears in the column list but not in the recorded
# summary, so it was presumably not parsed as a numeric column -- confirm.
auto_df.describe() # summary statistics per column; counterpart of R's summary(df)
| | mpg | cylinders | displacement | weight | acceleration | year | origin |
---|---|---|---|---|---|---|---|
count | 397.000000 | 397.000000 | 397.000000 | 397.000000 | 397.000000 | 397.000000 | 397.000000 |
mean | 23.515869 | 5.458438 | 193.532746 | 2970.261965 | 15.555668 | 75.994962 | 1.574307 |
std | 7.825804 | 1.701577 | 104.379583 | 847.904119 | 2.749995 | 3.690005 | 0.802549 |
min | 9.000000 | 3.000000 | 68.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
25% | 17.500000 | 4.000000 | 104.000000 | 2223.000000 | 13.800000 | 73.000000 | 1.000000 |
50% | 23.000000 | 4.000000 | 146.000000 | 2800.000000 | 15.500000 | 76.000000 | 1.000000 |
75% | 29.000000 | 8.000000 | 262.000000 | 3609.000000 | 17.100000 | 79.000000 | 2.000000 |
max | 46.600000 | 8.000000 | 455.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
8 rows × 7 columns
# Plot mpg against cylinders as unconnected points.
# The y label is set on the Axes returned by DataFrame.plot rather than via
# plt.ylabel() beforehand: a pyplot call made before plotting labels whatever
# axes is current at that moment, which is not guaranteed to be the one
# DataFrame.plot draws on.
ax = auto_df.plot(x="cylinders", y="mpg", style='o')
ax.set_ylabel("MPG")
ax # keep the Axes as the cell's displayed value
<matplotlib.axes.AxesSubplot at 0x46ef650>
# Box plot of the "mpg" column, grouped by the values of the "cylinders" column.
auto_df.boxplot(column="mpg", by="cylinders") # MPG distribution by number of cylinders
<matplotlib.axes.AxesSubplot at 0x499f510>
# Similar to R's pairs(): shows pairwise scatter plots between columns, with the
# distribution of each column along the diagonal.
# The R version that uses formulas does not seem to have a Python equivalent (and doesn't seem
# to be very useful for exploratory analysis IMO).
# scatter_matrix is accessed through pd.plotting (available since pandas 0.20);
# the older pd.tools.plotting location has been removed from pandas.
axes = pd.plotting.scatter_matrix(auto_df, color="brown")