2.3 Lab: Introduction to Python

2.3.1 Basic Commands

In [1]:

import numpy as np  # array type and numerical routines; np.array is used throughout this lab
import random # standard-library random module; the cells shown here draw their samples via np.random instead

In [2]:

x = np.array([1, 3, 2, 5])

In [3]:

print(x)

[1 3 2 5]

In [4]:

x = np.array([1, 6, 2])

In [5]:

print(x)

[1 6 2]

In [6]:

y = np.array([1, 4, 3])

In [7]:

len(x)

Out[7]:

In [8]:

len(y)

Out[8]:

In [9]:

print(x + y)

[ 2 10  5]

In [10]:

%whos

Variable   Type       Data/Info
-------------------------------
np         module     <module 'numpy' from '/op<...>kages/numpy/__init__.py'>
random     module     <module 'random' from '/o<...>lib/python3.6/random.py'>
x          ndarray    3: 3 elems, type `int64`, 24 bytes
y          ndarray    3: 3 elems, type `int64`, 24 bytes

In [11]:

del x # reset_selective x

In [12]:

%whos

Variable   Type       Data/Info
-------------------------------
np         module     <module 'numpy' from '/op<...>kages/numpy/__init__.py'>
random     module     <module 'random' from '/o<...>lib/python3.6/random.py'>
y          ndarray    3: 3 elems, type `int64`, 24 bytes

In [13]:

# The trailing `?` only displays IPython's help for the reset magic; no variable is cleared here.
reset?

In [14]:

# Arrange the four values as a 2x2 array filled column by column (order='F'),
# so the first column is [1, 2] and the second is [3, 4].
x = np.array([1, 2, 3, 4]).reshape((2, 2), order='F')

In [15]:

print(x)

[[1 3]
 [2 4]]

In [16]:

# Same values with the default ordering: the 2x2 array is filled row by row.
x = np.array([1, 2, 3, 4]).reshape((2, 2))

In [17]:

print(x)

[[1 2]
 [3 4]]

In [18]:

# np.matrix overloads ** as matrix power (see the x**2 cell below), unlike the
# element-wise behaviour of a plain np.array.
# NOTE(review): NumPy's documentation no longer recommends np.matrix; consider np.array.
x = np.matrix([[1, 2], [3, 4]])

In [19]:

print(x)

[[1 2]
 [3 4]]

In [20]:

print(np.sqrt(x))

[[ 1.          1.41421356]
 [ 1.73205081  2.        ]]

In [21]:

# x is an np.matrix here, so ** is matrix power: this prints the matrix product of x
# with itself, not the element-wise squares (np.square in the next cell gives those).
print(x**2)

[[ 7 10]
 [15 22]]

In [22]:

print(np.square(x))

[[ 1  4]
 [ 9 16]]

In [23]:

# Parameters of the normal distribution sampled in the following cells.
mu = 0     # mean
sigma = 1  # standard deviation

In [24]:

x = np.random.normal(mu, sigma, 50)

In [25]:

y = x + np.random.normal(50, 0.1, 50)

In [26]:

print(np.corrcoef(x, y))

[[ 1.          0.99496309]
 [ 0.99496309  1.        ]]

In [27]:

np.random.seed(1303)

In [28]:

print(np.random.normal(mu, sigma, 50)) # the seed was fixed in the previous cell, so this draw is the same on every run

[-0.03425693  0.06035959  0.45511859 -0.36593175 -1.6773304   0.5910023
  0.41090101  0.46972388 -1.50462476 -0.70082238  1.43196963  0.35474484
  1.67574682  1.62741373  0.27015354  0.15248539  0.11593596  0.89272237
 -2.16627436  0.26787192  0.36658207  2.72335408  0.44060293  0.36036757
  0.38119264 -0.27845602  1.73458476 -1.48138111 -0.47556927 -0.1932596
  0.68115816 -0.05143463 -0.59151688  0.02292374 -0.12259196  0.50633508
  0.63181139 -0.2443932   0.39847385 -1.2716468   0.43167303 -1.36491646
  0.91004701  0.65707308 -0.080445   -1.12057881 -1.31479423  0.26394714
 -0.59459381 -0.07624482]

In [29]:

np.random.seed(1303)
y = np.random.normal(mu, sigma, 100)

In [30]:

print(np.mean(y))

-0.0340632704471

In [31]:

# np.var defaults to ddof=0, i.e. it divides by n (population variance);
# pass ddof=1 to get the sample variance that divides by n - 1.
print(np.var(y))

0.875615101383

In [32]:

print(np.sqrt(np.var(y)))

0.935743074451

In [33]:

print(np.std(y))

0.935743074451

2.3.2 Graphics

In [34]:

# matplotlib is the most widely used plotting library in Python.
# matplotlib.pyplot is a collection of command-style functions that make matplotlib work like MATLAB.
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:

x = np.random.normal(0, 1, 100)
y = np.random.normal(0, 1, 100)

In [36]:

import os  # standard library; only needed to create the output directory

# Scatter plot (blue circles) of the two samples drawn in the previous cell.
plt.plot(x, y, 'bo') # please use plt.plot? to look at more options
plt.ylabel("this is the y-axis")
plt.xlabel("this is the x-axis")
plt.title("Plot of X vs Y")

# plt.savefig does not create missing directories, so make sure the target
# folder exists first; otherwise the cell fails with FileNotFoundError.
figure_path = '../../../output/Figure.pdf'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path) # use plt.savefig function to save images (call it before plt.show())
plt.show()

In [37]:

# np.arange uses a half-open interval: stop=11 itself is excluded, giving 1..10.
x = np.arange(start=1, stop=11)

In [38]:

print(x)

[ 1  2  3  4  5  6  7  8  9 10]

In [39]:

# The standard-library math module supplies the constant pi.
import math
# 50 evenly spaced points covering [-pi, pi], both end points included.
x = np.linspace(start=-math.pi, stop=math.pi, num=50)

In [40]:

print(x)

[-3.14159265 -3.01336438 -2.88513611 -2.75690784 -2.62867957 -2.5004513
 -2.37222302 -2.24399475 -2.11576648 -1.98753821 -1.85930994 -1.73108167
 -1.60285339 -1.47462512 -1.34639685 -1.21816858 -1.08994031 -0.96171204
 -0.83348377 -0.70525549 -0.57702722 -0.44879895 -0.32057068 -0.19234241
 -0.06411414  0.06411414  0.19234241  0.32057068  0.44879895  0.57702722
  0.70525549  0.83348377  0.96171204  1.08994031  1.21816858  1.34639685
  1.47462512  1.60285339  1.73108167  1.85930994  1.98753821  2.11576648
  2.24399475  2.37222302  2.5004513   2.62867957  2.75690784  2.88513611
  3.01336438  3.14159265]

In [41]:

y = x
X, Y = np.meshgrid(x,y)

In [42]:

f = np.cos(Y)/(1 + np.square(X))
CS = plt.contour(X, Y, f)
plt.show()

In [43]:

# Antisymmetric part of f, so fa[i, j] == -fa[j, i].
fa = (f - f.T)/2 # f.T is the transpose; np.transpose(f) is equivalent
# imshow draws row 0 at the TOP by default (origin='upper'), while `extent` labels the
# bottom edge with y[0]. Rows of fa follow y (Y from np.meshgrid varies along rows), so
# origin='lower' is needed for row i to be drawn at y[i] and match the axis labels.
plt.imshow(fa, extent=(x[0], x[-1], y[0], y[-1]), origin='lower')
plt.show()

In [44]:

from mpl_toolkits.mplot3d import axes3d
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.plot_wireframe(X, Y, fa)
plt.show()

2.3.3 Indexing Data

In [45]:

# 4x4 array holding 1..16, filled column by column (the first column is 1, 2, 3, 4).
A = np.arange(1, 17).reshape((4, 4), order='F')

In [46]:

print(A)

[[ 1  5  9 13]
 [ 2  6 10 14]
 [ 3  7 11 15]
 [ 4  8 12 16]]

In [47]:

# R starts the index from 1, but Python starts the index from 0.
# To select the same number (10) as the book did, we need to reduce the index by 1
print(A[1, 2])

In [48]:

# np.ix_ builds the open mesh "rows {0, 2} x columns {1, 3}", selecting the 2x2 sub-matrix.
print(A[np.ix_([0, 2], [1, 3])])

[[ 5 13]
 [ 7 15]]

In [49]:

print(A[0:3, 1:4])

[[ 5  9 13]
 [ 6 10 14]
 [ 7 11 15]]

In [50]:

print(A[0:2, :])

[[ 1  5  9 13]
 [ 2  6 10 14]]

In [51]:

print(A[:, 0:2])

[[1 5]
 [2 6]
 [3 7]
 [4 8]]

In [52]:

print(A[0,:])

[ 1  5  9 13]

In [53]:

# minus sign has a different meaning in Python.
# This means index from the end.
# -1 means the last element
print(A[-1, -1])

In [54]:

A.shape

Out[54]:

(4, 4)

2.3.4 Loading Data

In [55]:

# In Python, pandas is the commonly used module for reading a file into a data frame.
import pandas as pd 
# header=0: the first line of the file provides the column names.
# na_values='?': cells containing '?' are read as missing values (NaN).
Auto = pd.read_csv('../../../data/Auto.csv', header=0, na_values='?')

In [56]:

Auto.head()

Out[56]:

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18.0	8	307.0	130.0	3504	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693	11.5	70	1	buick skylark 320
2	18.0	8	318.0	150.0	3436	11.0	70	1	plymouth satellite
3	16.0	8	304.0	150.0	3433	12.0	70	1	amc rebel sst
4	17.0	8	302.0	140.0	3449	10.5	70	1	ford torino

In [57]:

Auto.shape

Out[57]:

(397, 9)

In [58]:

Auto.iloc[32]

Out[58]:

mpg                     25
cylinders                4
displacement            98
horsepower             NaN
weight                2046
acceleration            19
year                    71
origin                   1
name            ford pinto
Name: 32, dtype: object

In [59]:

Auto.iloc[:4, :2]

Out[59]:

	mpg	cylinders
0	18.0	8
1	15.0	8
2	18.0	8
3	16.0	8

In [60]:

Auto.columns

Out[60]:

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')

In [61]:

list(Auto)

Out[61]:

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'year',
 'origin',
 'name']

In [62]:

# .isnull() flags the missing entries and .sum() counts them, giving the number of NaNs in each column.
Auto.isnull().sum()

Out[62]:

mpg             0
cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [63]:

# There are 397 rows in the data and only 5 with missing values.
# We can just drop the ones with missing values.
Auto = Auto.dropna()

In [64]:

Auto.shape

Out[64]:

(392, 9)

2.3.5 Additional Graphical and Numerical Summaries

In [65]:

# Scatter plot (red circles) of mpg against the number of cylinders.
# The axes are labelled so the figure can be read on its own, like the earlier plot cell.
plt.plot(Auto.cylinders, Auto.mpg, 'ro')
plt.xlabel('cylinders')
plt.ylabel('mpg')
plt.show()

In [66]:

Auto.hist(column = ['cylinders', 'mpg'])
plt.show()

In [67]:

import seaborn as sns

In [68]:

sns.pairplot(Auto)
plt.show()

In [69]:

sns.pairplot(Auto, vars = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration'])
plt.show()

In [70]:

Auto.describe()

Out[70]:

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin
count	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000
mean	23.445918	5.471939	194.411990	104.469388	2977.584184	15.541327	75.979592	1.576531
std	7.805007	1.705783	104.644004	38.491160	849.402560	2.758864	3.683737	0.805518
min	9.000000	3.000000	68.000000	46.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.000000	4.000000	105.000000	75.000000	2225.250000	13.775000	73.000000	1.000000
50%	22.750000	4.000000	151.000000	93.500000	2803.500000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	275.750000	126.000000	3614.750000	17.025000	79.000000	2.000000
max	46.600000	8.000000	455.000000	230.000000	5140.000000	24.800000	82.000000	3.000000

In [71]:

Auto.describe(include = 'all')

Out[71]:

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
count	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392
unique	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	301
top	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	toyota corolla
freq	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	5
mean	23.445918	5.471939	194.411990	104.469388	2977.584184	15.541327	75.979592	1.576531	NaN
std	7.805007	1.705783	104.644004	38.491160	849.402560	2.758864	3.683737	0.805518	NaN
min	9.000000	3.000000	68.000000	46.000000	1613.000000	8.000000	70.000000	1.000000	NaN
25%	17.000000	4.000000	105.000000	75.000000	2225.250000	13.775000	73.000000	1.000000	NaN
50%	22.750000	4.000000	151.000000	93.500000	2803.500000	15.500000	76.000000	1.000000	NaN
75%	29.000000	8.000000	275.750000	126.000000	3614.750000	17.025000	79.000000	2.000000	NaN
max	46.600000	8.000000	455.000000	230.000000	5140.000000	24.800000	82.000000	3.000000	NaN

In [72]:

# Convert cylinders into a categorical (qualitative) variable. describe(include='all')
# then reports count/unique/top/freq for it instead of mean/std/quantiles.
Auto['cylinders'] = Auto['cylinders'].astype('category')

In [73]:

Auto.describe(include= 'all')

Out[73]:

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
count	392.000000	392.0	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392
unique	NaN	5.0	NaN	NaN	NaN	NaN	NaN	NaN	301
top	NaN	4.0	NaN	NaN	NaN	NaN	NaN	NaN	toyota corolla
freq	NaN	199.0	NaN	NaN	NaN	NaN	NaN	NaN	5
mean	23.445918	NaN	194.411990	104.469388	2977.584184	15.541327	75.979592	1.576531	NaN
std	7.805007	NaN	104.644004	38.491160	849.402560	2.758864	3.683737	0.805518	NaN
min	9.000000	NaN	68.000000	46.000000	1613.000000	8.000000	70.000000	1.000000	NaN
25%	17.000000	NaN	105.000000	75.000000	2225.250000	13.775000	73.000000	1.000000	NaN
50%	22.750000	NaN	151.000000	93.500000	2803.500000	15.500000	76.000000	1.000000	NaN
75%	29.000000	NaN	275.750000	126.000000	3614.750000	17.025000	79.000000	2.000000	NaN
max	46.600000	NaN	455.000000	230.000000	5140.000000	24.800000	82.000000	3.000000	NaN