import numpy as np # NumPy supplies np.array, which is used for the numerical work in this file
import random # standard-library random module; the code visible here draws its random numbers from np.random instead
x = np.array([1, 3, 2, 5])
print(x)
[1 3 2 5]
x = np.array([1, 6, 2])
print(x)
[1 6 2]
y = np.array([1, 4, 3])
len(x)
3
len(y)
3
print(x + y)
[ 2 10 5]
%whos
Variable Type Data/Info ------------------------------- np module <module 'numpy' from '/op<...>kages/numpy/__init__.py'> random module <module 'random' from '/o<...>lib/python3.6/random.py'> x ndarray 3: 3 elems, type `int64`, 24 bytes y ndarray 3: 3 elems, type `int64`, 24 bytes
del x # reset_selective x
%whos
Variable Type Data/Info ------------------------------- np module <module 'numpy' from '/op<...>kages/numpy/__init__.py'> random module <module 'random' from '/o<...>lib/python3.6/random.py'> y ndarray 3: 3 elems, type `int64`, 24 bytes
reset?
# Fill a 2x2 array from the values 1..4 in column-major (Fortran) order,
# i.e. down the first column and then down the second.
x = np.array([1, 2, 3, 4]).reshape((2, 2), order='F')
print(x)
[[1 3] [2 4]]
# The default (C) order fills the 2x2 array row by row.
x = np.reshape(np.array([1, 2, 3, 4]), (2, 2), order='C')
print(x)
[[1 2] [3 4]]
# np.matrix is a strictly 2-D array subclass whose * and ** operators perform
# matrix (not elementwise) arithmetic. NumPy's documentation no longer
# recommends it for new code; it is used here to show that difference.
x = np.matrix('1 2; 3 4')
print(x)
[[1 2] [3 4]]
print(np.sqrt(x))
[[ 1. 1.41421356] [ 1.73205081 2. ]]
# x is an np.matrix here, so ** is a matrix power: x**2 is the matrix product
# of x with itself, not the elementwise square that np.square returns.
print(x**2)
[[ 7 10] [15 22]]
print(np.square(x))
[[ 1 4] [ 9 16]]
# Draw 50 standard-normal values, then build y as x plus noise centred on 50
# with a small spread (sd 0.1), so x and y are almost perfectly correlated.
mu = 0     # mean
sigma = 1  # standard deviation
x = np.random.normal(loc=mu, scale=sigma, size=50)
y = x + np.random.normal(loc=50, scale=0.1, size=50)
print(np.corrcoef(x, y))
[[ 1. 0.99496309] [ 0.99496309 1. ]]
# Seeding the generator makes the draw reproducible: with this seed the same
# 50 values are generated on every run.
np.random.seed(1303)
print(np.random.normal(loc=mu, scale=sigma, size=50))
[-0.03425693 0.06035959 0.45511859 -0.36593175 -1.6773304 0.5910023 0.41090101 0.46972388 -1.50462476 -0.70082238 1.43196963 0.35474484 1.67574682 1.62741373 0.27015354 0.15248539 0.11593596 0.89272237 -2.16627436 0.26787192 0.36658207 2.72335408 0.44060293 0.36036757 0.38119264 -0.27845602 1.73458476 -1.48138111 -0.47556927 -0.1932596 0.68115816 -0.05143463 -0.59151688 0.02292374 -0.12259196 0.50633508 0.63181139 -0.2443932 0.39847385 -1.2716468 0.43167303 -1.36491646 0.91004701 0.65707308 -0.080445 -1.12057881 -1.31479423 0.26394714 -0.59459381 -0.07624482]
# Re-seed so that y is the same 100-value sample on every run, then report its mean.
np.random.seed(1303)
y = np.random.normal(loc=mu, scale=sigma, size=100)
print(np.mean(y))
-0.0340632704471
print(np.var(y))
0.875615101383
print(np.sqrt(np.var(y)))
0.935743074451
print(np.std(y))
0.935743074451
# In Python, matplotlib is the most widely used plotting library.
# matplotlib.pyplot is a collection of command-style functions that make matplotlib work like MATLAB.
import matplotlib.pyplot as plt
%matplotlib inline
# Scatter two independent standard-normal samples of 100 points each.
x = np.random.normal(loc=0, scale=1, size=100)
y = np.random.normal(loc=0, scale=1, size=100)
plt.plot(x, y, 'bo')  # 'bo' = blue circle markers; run plt.plot? in IPython to see more options
plt.title("Plot of X vs Y")
plt.xlabel("this is the x-axis")
plt.ylabel("this is the y-axis")
# plt.savefig writes the current figure to the given path.
plt.savefig('../../../output/Figure.pdf')
plt.show()
# np.arange uses a half-open interval: the stop value (11) is excluded,
# so this yields the integers 1 through 10.
x = np.arange(start=1, stop=11, step=1)
print(x)
[ 1 2 3 4 5 6 7 8 9 10]
# math.pi provides the constant pi, so the math module is imported first.
import math
# 50 evenly spaced points covering [-pi, pi], with both end points included.
x = np.linspace(start=-math.pi, stop=math.pi, num=50)
print(x)
[-3.14159265 -3.01336438 -2.88513611 -2.75690784 -2.62867957 -2.5004513 -2.37222302 -2.24399475 -2.11576648 -1.98753821 -1.85930994 -1.73108167 -1.60285339 -1.47462512 -1.34639685 -1.21816858 -1.08994031 -0.96171204 -0.83348377 -0.70525549 -0.57702722 -0.44879895 -0.32057068 -0.19234241 -0.06411414 0.06411414 0.19234241 0.32057068 0.44879895 0.57702722 0.70525549 0.83348377 0.96171204 1.08994031 1.21816858 1.34639685 1.47462512 1.60285339 1.73108167 1.85930994 1.98753821 2.11576648 2.24399475 2.37222302 2.5004513 2.62867957 2.75690784 2.88513611 3.01336438 3.14159265]
# Evaluate f(x, y) = cos(y) / (1 + x^2) on the square grid that uses the same
# points along both axes, then draw its contour lines.
y = x
X, Y = np.meshgrid(x, y)
f = np.cos(Y) / (1 + X * X)
CS = plt.contour(X, Y, f)
plt.show()
# Antisymmetric part of f: fa[i, j] = (f[i, j] - f[j, i]) / 2.
# f.T (equivalently np.transpose(f)) is the transpose of f.
fa = (f - f.T)/2
# Row i of fa corresponds to y[i] and column j to x[j], and the extent puts
# y[0] at the bottom of the axis. imshow's default origin='upper' would draw
# row 0 at the top, i.e. vertically flipped against the y tick labels, so
# origin='lower' is requested to keep the image aligned with its axes.
plt.imshow(fa, extent=(x[0], x[-1], y[0], y[-1]), origin='lower')
plt.show()
# axes3d is not referenced by name below; presumably it is imported for its
# side effect of making the '3d' projection available (TODO confirm for the
# matplotlib version in use).
from mpl_toolkits.mplot3d import axes3d
# Draw fa as a wireframe surface over the (X, Y) grid on a single 3-D axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_wireframe(X, Y, fa)
plt.show()
# 4x4 matrix holding 1..16 filled column by column, so the first column is
# 1, 2, 3, 4 and the first row is 1, 5, 9, 13.
A = np.arange(1, 17).reshape((4, 4), order='F')
print(A)
[[ 1 5 9 13] [ 2 6 10 14] [ 3 7 11 15] [ 4 8 12 16]]
# R starts the index from 1, but Python starts the index from 0.
# To select the same number (10) as the book did, we need to reduce the index by 1
print(A[1, 2])
10
print(A[[[0],[2]], [1,3]])
[[ 5 13] [ 7 15]]
print(A[0:3, 1:4])
[[ 5 9 13] [ 6 10 14] [ 7 11 15]]
print(A[0:2, :])
[[ 1 5 9 13] [ 2 6 10 14]]
print(A[:, 0:2])
[[1 5] [2 6] [3 7] [4 8]]
print(A[0,:])
[ 1 5 9 13]
# A negative index has a different meaning in Python than in R:
# it counts from the end of the axis instead of excluding elements,
# so -1 refers to the last element.
print(A[-1, -1])
16
A.shape
(4, 4)
# In Python, pandas is the module commonly used to read a file into a data frame.
import pandas as pd
# Load the Auto data set: row 0 of the file supplies the column names and
# '?' entries are parsed as missing values (NaN). Then show the first 5 rows.
Auto = pd.read_csv('../../../data/Auto.csv', na_values='?', header=0)
Auto.head(n=5)
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | ford torino |
Auto.shape
(397, 9)
Auto.iloc[32]
mpg 25 cylinders 4 displacement 98 horsepower NaN weight 2046 acceleration 19 year 71 origin 1 name ford pinto Name: 32, dtype: object
Auto.iloc[:4, :2]
mpg | cylinders | |
---|---|---|
0 | 18.0 | 8 |
1 | 15.0 | 8 |
2 | 18.0 | 8 |
3 | 16.0 | 8 |
Auto.columns
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name'], dtype='object')
list(Auto)
['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
# Count the missing values (NaN) in each column: .isnull() flags them and
# .sum() adds the flags up column by column.
Auto.isnull().sum(axis=0)
mpg 0 cylinders 0 displacement 0 horsepower 5 weight 0 acceleration 0 year 0 origin 0 name 0 dtype: int64
# Only 5 of the 397 rows contain a missing value, so those rows are simply discarded.
Auto = Auto.dropna(axis=0, how='any')
Auto.shape
(392, 9)
# Scatter mpg against the number of cylinders using red circle markers.
plt.plot(Auto['cylinders'], Auto['mpg'], 'ro')
plt.show()
# Histograms of the same two columns.
Auto.hist(column=['cylinders', 'mpg'])
plt.show()
import seaborn as sns
# Scatter-plot matrix built from the whole Auto data frame.
sns.pairplot(data=Auto)
plt.show()
# The same kind of plot restricted to five selected variables.
sns.pairplot(data=Auto, vars=['mpg', 'displacement', 'horsepower', 'weight', 'acceleration'])
plt.show()
Auto.describe()
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
---|---|---|---|---|---|---|---|---|
count | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 |
mean | 23.445918 | 5.471939 | 194.411990 | 104.469388 | 2977.584184 | 15.541327 | 75.979592 | 1.576531 |
std | 7.805007 | 1.705783 | 104.644004 | 38.491160 | 849.402560 | 2.758864 | 3.683737 | 0.805518 |
min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
25% | 17.000000 | 4.000000 | 105.000000 | 75.000000 | 2225.250000 | 13.775000 | 73.000000 | 1.000000 |
50% | 22.750000 | 4.000000 | 151.000000 | 93.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 |
75% | 29.000000 | 8.000000 | 275.750000 | 126.000000 | 3614.750000 | 17.025000 | 79.000000 | 2.000000 |
max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
Auto.describe(include = 'all')
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
count | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 301 |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | toyota corolla |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5 |
mean | 23.445918 | 5.471939 | 194.411990 | 104.469388 | 2977.584184 | 15.541327 | 75.979592 | 1.576531 | NaN |
std | 7.805007 | 1.705783 | 104.644004 | 38.491160 | 849.402560 | 2.758864 | 3.683737 | 0.805518 | NaN |
min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 | NaN |
25% | 17.000000 | 4.000000 | 105.000000 | 75.000000 | 2225.250000 | 13.775000 | 73.000000 | 1.000000 | NaN |
50% | 22.750000 | 4.000000 | 151.000000 | 93.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 | NaN |
75% | 29.000000 | 8.000000 | 275.750000 | 126.000000 | 3614.750000 | 17.025000 | 79.000000 | 2.000000 | NaN |
max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 | NaN |
# Treat cylinders as a categorical variable instead of a number, then
# summarise every column again.
Auto['cylinders'] = Auto.cylinders.astype('category')
Auto.describe(include='all')
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
count | 392.000000 | 392.0 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392 |
unique | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | 301 |
top | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | toyota corolla |
freq | NaN | 199.0 | NaN | NaN | NaN | NaN | NaN | NaN | 5 |
mean | 23.445918 | NaN | 194.411990 | 104.469388 | 2977.584184 | 15.541327 | 75.979592 | 1.576531 | NaN |
std | 7.805007 | NaN | 104.644004 | 38.491160 | 849.402560 | 2.758864 | 3.683737 | 0.805518 | NaN |
min | 9.000000 | NaN | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 | NaN |
25% | 17.000000 | NaN | 105.000000 | 75.000000 | 2225.250000 | 13.775000 | 73.000000 | 1.000000 | NaN |
50% | 22.750000 | NaN | 151.000000 | 93.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 | NaN |
75% | 29.000000 | NaN | 275.750000 | 126.000000 | 3614.750000 | 17.025000 | 79.000000 | 2.000000 | NaN |
max | 46.600000 | NaN | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 | NaN |