import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
# embed static images in the ipynb
%matplotlib inline
typing
from typing import List, Set, Dict, Tuple, Optional
# primitive types
x: int = 1
x: float = 1.0
x: bool = True
x: str = "test"
x: bytes = b"test"
# Collection types
x: Set[int] = {1, 2}
x: List[int] = [0]
x: Dict[str, float] = {'credit': 705.0}
x: Tuple[str, ...] = ('a', 'b', 'c')
# annotation for function declarations
from typing import Callable, Iterable, Union, Optional
def fn(n: int, m: int = 2) -> str:
    return str(n + m)
x: Callable[[int, int], str] = fn
# Anything that we can use a for loop on is an 'Iterable'
def generate_up_to(n: int) -> Iterable[int]:
    i = 0
    while i <= n:
        yield i
        i += 1
from typing import Generator
# A generator (a function with a yield statement) is a specific kind of Iterable.
def generate_up_to(n: int) -> Generator[str, None, None]:
    i = 0
    while i <= n:
        yield str(i)
        i += 1
zip vs. enumerate
letters = ['a', 'b', 'c']
nums = [0, 1, 2]
# Goal: Create (letter, int) pairs
# Without zip
for idx, letter in enumerate(letters):
    num = nums[idx]
    print(f"{letter} {num}")
# With zip
for num, letter in zip(nums, letters):
    print(f"{letter} {num}")
a 0
b 1
c 2
a 0
b 1
c 2
What's the advantage of using zip?
zip is a function that takes iterables as arguments and produces an iterator of tuples, which is itself an iterable. Note that zip stops at the shortest input, so if the collections have different lengths, the extra elements are silently dropped. One advantage of using enumerate to keep an explicit index is that you control how mismatched lengths are handled instead of relying on zip's truncation.
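A quick sketch of that truncation behavior, plus itertools.zip_longest as the padded alternative (the lists below are just for illustration):
from itertools import zip_longest
letters = ['a', 'b', 'c']
nums = [0, 1]  # deliberately shorter
# zip stops at the shortest iterable: only two pairs are produced.
print(list(zip(letters, nums)))          # [('a', 0), ('b', 1)]
# zip_longest pads the shorter iterable instead of truncating.
print(list(zip_longest(letters, nums)))  # [('a', 0), ('b', 1), ('c', None)]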
Since zip produces a stream of pairs, we can pass its output to dict to build dictionaries:
letters = ['a', 'b', 'c']
nums = [0, 1, 2]
dict(zip(letters, nums))
{'a': 0, 'b': 1, 'c': 2}
import numpy as np
buildings = ['b' + str(i) for i in np.arange(10)]
heights = np.random.random(10)
bh: dict = dict(zip(buildings, heights))
# Print the height of the tallest building in 'bh'
print(max(bh.values()))
# Print the building-height pair corresponding to the tallest building
print(max(bh.items(), key=lambda b: b[1]))  # max over (building, height) items, keyed by the height
# Print the name of the tallest building
print(max(bh, key=bh.get))
0.9906713395043445
('b6', 0.9906713395043445)
b6
# Create object to loop through.
import numpy as np
n_rows = 3
M: np.ndarray = np.arange(n_rows**2).reshape(n_rows, -1)
# Create generator to replace nested loop.
def element_generator(arr):
    assert arr.ndim == 2
    for i, row in enumerate(arr):
        for j, element in enumerate(row):
            idx = [i, j]
            yield idx, element
# Exiting the generator loop only requires 1 break statement.
for idx, element in element_generator(M):
    print(f"{element**2}\t{idx}")
    if idx == [1, 1]:
        break
0	[0, 0]
1	[0, 1]
4	[0, 2]
9	[1, 0]
16	[1, 1]
os and sys
The os library is for operating system dependent functionality, whereas sys is for functionality related to interactions between the program and the Python interpreter. We manipulate the Python runtime environment with sys.
Mine for commands: link
import os
# returns the current working directory as a string
os.getcwd()
# Check if a path exists
path: str = os.getcwd()
os.path.exists(path)
# Joining paths with strings
os.path.join(path, "..", "Website")
os.path.exists( os.path.join(path, "..", "Website") )
When you use the import statement, a module can be loaded from the "PYTHONPATH" environment variable, the current working directory, or other directories configured when Python was installed.
In order to access (and add to) the available Python system paths, we use the sys library.
import sys
sys.path # module search paths
Since sys.path is a list, it can be appended to. This means modules can be imported from any desired folder by means of sys.path.append(some_path).
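A minimal sketch of adding a folder to the module search path; the folder and module names here are hypothetical:
import os
import sys
# Hypothetical folder containing a module named my_helpers.py
extra_modules_dir = os.path.join(os.getcwd(), "..", "my_modules")
if extra_modules_dir not in sys.path:
    sys.path.append(extra_modules_dir)
# After this, `import my_helpers` would also search that folder.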
You can display the Python version either from the terminal or using the sys library.
!python --version
Python 3.8.5
The above cell uses Jupyter's shell-escape syntax (!) to run a terminal command. To accomplish the same thing outside of a Jupyter notebook, you'd either enter the command directly in the terminal or, equivalently, use the os library.
import os
os.system("python --version")
os.system("cd") # Note that these commands output to the terminal
0
# Display Python version with 'sys'
import sys
sys.version
'3.8.5 (default, Sep 3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]'
collections.namedtuple
# collections.namedtuple
import collections
from typing import NamedTuple
def make_car(brand, fuel_efficiency, color) -> NamedTuple:
    if brand == "Tesla":
        Car = collections.namedtuple(
            typename="Car", field_names=["brand", "miles_per_kWh", "color"])
    else:
        Car = collections.namedtuple(
            typename="Car", field_names=["brand", "mpg", "color"])
    return Car(brand, fuel_efficiency, color)
black_model_X = make_car("Tesla", 2.86, "black")
# namedtuples have customizable attribute names.
print(black_model_X.brand, black_model_X.miles_per_kWh, black_model_X.color)
print(black_model_X)
black_model_X._asdict() # return as dict
Tesla 2.86 black
Car(brand='Tesla', miles_per_kWh=2.86, color='black')
{'brand': 'Tesla', 'miles_per_kWh': 2.86, 'color': 'black'}
sorted([1, 4, 3, 2])
sorted(range(4, 0, -1))
letters = ['a', 'abcd', 'ab', 'abc', 'cb', 'defg', 'def', 'e']
sorted(letters, key=len) # sorts based on output of len()
Toy datasets in scikit-learn come from sklearn.datasets.
You have a few options for inspecting these dataset objects. Try dir() and vars():
import sklearn.datasets
obj = sklearn.datasets.load_iris()
print(f"dir(obj):\t{dir(obj)}\nfor object type:\t{type(obj)}")
import sklearn.datasets
sklearn_dataset = sklearn.datasets.load_breast_cancer()
dir(sklearn_dataset) # Display attributes of sklearn.utils.Bunch object
type(sklearn_dataset)
# sklearn.utils.Bunch description
type(sklearn_dataset.DESCR)
print(sklearn_dataset.DESCR)
sklearn_dataset.data
Convert a sklearn.datasets dataset into a pd.DataFrame.
from sklearn.datasets import load_boston
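# Note: load_boston was removed in scikit-learn 1.2; the Boston housing cells below assume an older scikit-learn version.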
import pandas as pd
boston = load_boston()
boston_df = pd.DataFrame(boston.data) # Set column indices as feature names
boston_df.columns = boston.feature_names
boston_df['PRICE'] = boston.target # Specify 'PRICE' as the target variable
boston_df.head()
# Convert a dataset from 'sklearn.datasets' into a pd.DataFrame.
from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def convert_boston_to_df():
    boston = datasets.load_boston()
    boston_df = pd.DataFrame(
        data=boston.data, columns=boston.feature_names)
    boston_df['PRICE'] = boston.target  # Specify 'PRICE' as the target variable
    return boston_df
def correlation_matrix(df, plot: bool, digits: int = 2) -> pd.DataFrame:
    """Plot and return the correlation matrix of a pd.DataFrame."""
    corr_df = df.corr().round(digits)
    if plot:
        fig = plt.figure(figsize=(16, 12))
        ax = sns.heatmap(corr_df, annot=True, cmap='Blues')
        plt.show()
    return corr_df
correlation_matrix(df = convert_boston_to_df(), plot=True)
import pandas as pd
pd.DataFrame?
# Generate simple random walk plot
rng = np.random.RandomState(1)
x = np.linspace(0, 10, 500)
y = np.cumsum(rng.randn(500, 6), 0) # sample 500 by 6 array from std normal dist
plt.plot(x, y)
Q: What does the np.cumsum method do?
Return the cumulative sum of the elements of a np.ndarray along a given axis.
Q: What does the np.linspace method do?
Return a np.ndarray of evenly spaced numbers over a specified interval.
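A quick, easy-to-verify sketch of both functions:
import numpy as np
# Five evenly spaced numbers from 0 to 1, inclusive of both endpoints.
print(np.linspace(0, 1, num=5))   # [0.   0.25 0.5  0.75 1.  ]
# Cumulative sums: flattened by default, or along a given axis.
a = np.array([[1, 2], [3, 4]])
print(np.cumsum(a))               # [ 1  3  6 10]
print(np.cumsum(a, axis=0))       # rows accumulate down axis 0: [1 2], then [4 6]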
# plot sine and cosine graphs from -pi to pi
x = np.linspace(start=-np.pi, stop=np.pi, num=100)
fig = plt.figure()
plt.plot(x, np.sin(x), '-')
plt.plot(x, np.cos(x), '--')
Q: What is the type of plt.figure?
function (a function defined in the matplotlib.pyplot module)
Q: What does the plt.figure function do?
It creates a new figure, makes it the current (active) figure, and returns it.
Q: The figure object generated by plt.figure() has what type?
mpl.figure.Figure
fig = plt.figure()
type(fig)
Q: Save the figure to the working directory as a png.
# plot sine and cosine graphs from -pi to pi
x = np.linspace(start=-np.pi, stop=np.pi, num=100)
fig = plt.figure()
plt.plot(x, np.sin(x), '-')
plt.plot(x, np.cos(x), '--')
fig.savefig('trig_functions.png')
Q: Use markdown to confirm that the image, 'trig_functions.png', is saved in the working directory.
![trig_functions](trig_functions.png)
Q: Use a Jupyter module to confirm that the image, 'trig_functions.png', is saved in the working directory.
from IPython.display import Image
Image("trig_functions.png")
cloze: In the mpl.figure.Figure.savefig() method, the file format is inferred from the extension of the given filename.
# Display supported file types for the mpl.figure.Figure.savefig() method.
fig.canvas.get_supported_filetypes()
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))
x = np.linspace(start=-10, stop=10, num=100)
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))
Q: What's the type of ax?
AxesSubplot when plt.subplots() is called with its defaults; with nrows or ncols greater than 1 (as above), ax is instead a numpy array of AxesSubplot objects.
fig, ax = plt.subplots()
type(ax)
Q: What's the type of fig?
mpl.figure.Figure
type(fig)
# simple sinusoids using figure and axes instances
fig, ax = plt.subplots()
x = np.linspace(start=0, stop=4*np.pi)
ax.plot(x, np.sin(x))
ax.plot(x, np.cos(x))
print(type(fig), type(ax))
fig, ax = plt.subplots()
x = np.linspace(start=0, stop=4*np.pi)
# Line colors
ax.plot(x, np.sin(x - 0), color='blue') # specify color by name
ax.plot(x, np.sin(x - 1), color='g') # short color code (rgbcmyk)
ax.plot(x, np.sin(x - 2), color='0.75') # Grayscale between 0 and 1
ax.plot(x, np.sin(x - 3), color='#FFDD44') # Hex code (RRGGBB from 00 to FF)
ax.plot(x, np.sin(x - 4), color=(1.0, 0.2, 0.3)) # RGB tuple, values between 0 and 1
ax.plot(x, np.sin(x - 5), color='chartreuse'); # all HTML color names supported
rgbcmyk: cmyk stands for Cyan, Magenta, Yellow, blacK
fig, ax = plt.subplots()
x = np.linspace(start=0, stop=4*np.pi)
# Line styles
ax.plot(x, x + 0, linestyle='solid')
ax.plot(x, x + 1, linestyle='dashed')
ax.plot(x, x + 2, linestyle='dashdot')
ax.plot(x, x + 3, linestyle='dotted');
# shorthand for line styles
ax.plot(x, x + 4, linestyle='-') # solid
ax.plot(x, x + 5, linestyle='--') # dashed
ax.plot(x, x + 6, linestyle='-.') # dashdot
ax.plot(x, x + 7, linestyle=':'); # dotted
# Set multiple properties of an AxesSubplot with ax.set() method
x = np.linspace(start=0, stop=4*np.pi)
ax = plt.axes()
ax.plot(x, np.sin(x))
ax.set(xlim=(0,4*np.pi), ylim=(-1, 1),
xlabel='x', ylabel='sin(x)',
title='Simple Sinusoid');
fig, ax_array = plt.subplots(nrows=4, ncols=3, figsize=(10, 10))
fig.tight_layout()  # prevents subplots from overlapping
count = 0
shift = 0
for row in ax_array:
    for axes in row:
        axes.plot(x, np.sin(x + shift))
        axes.set(title='t=' + str(count))
        count += 1
        shift += np.pi / 2
# currently in Section 4.2.3: Labeling Plots
def check_null_values(df):
    """Check if there are missing values in a pd.DataFrame."""
    null_counts = df.isnull().sum()
    if not np.any(null_counts.to_numpy()):
        print("The DataFrame has no null values.")
    else:
        return null_counts
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv')
def tt_split(data):
    # Specify feature and target matrix
    if isinstance(data, pd.DataFrame):
        X, Y = data.iloc[:, :4], data.iloc[:, -1]
    elif isinstance(data, np.ndarray):
        X, Y = data[:, :4], data[:, -1].astype(float)
    # Perform train-test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=7)
    return X_train, X_test, Y_train, Y_test
X_train, X_test, Y_train, Y_test = tt_split(df)
from sklearn import preprocessing
def scale_data(X_train, X_test, type="standard"):
    scaler = (preprocessing.StandardScaler()
              if type == "standard"
              else preprocessing.MinMaxScaler())
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled
# Gives minimum of 0, max of 1
X_train, X_test = scale_data(X_train, X_test, "minmax")
pd.DataFrame(X_train).describe()
# Gives mean of 0, standard dev of 1
X_train, X_test = scale_data(X_train, X_test, "standard")
pd.DataFrame(X_train).describe()
from sklearn import metrics
# metrics.auc?
# metrics.roc_auc_score?
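A small sketch of metrics.roc_auc_score on made-up labels and scores (the arrays below are purely illustrative):
from sklearn import metrics
import numpy as np
y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])  # predicted probabilities for class 1
# AUC for these toy values is 0.75.
print(metrics.roc_auc_score(y_true, y_score))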
# Note: The 'torch.topk' method accomplishes this task
import numpy as np
def get_index_n_highest(a: np.ndarray, n: int):
    """
    Arguments:
        a (np.ndarray, 1D): input vector
        n (int): how many of the highest indices you want
    Returns:
        highest_indices (np.ndarray): array containing the indices of
            the n highest elements
    Examples:
    --------
    Return the indices of the rows containing the 6 highest values in
    the last column of A, a 25x25 matrix.
    >>> import numpy as np
    >>> rng = np.random.RandomState(7)
    >>> A = rng.randint(low=1, high=100, size=(25, 25))
    >>> last_col = A[:, -1].copy()
    >>> highest_indices = get_index_n_highest(last_col, 6)
    >>> highest_indices
    array([12,  8, 23, 13, 21, 20])
    >>> last_col[highest_indices]
    array([92, 90, 88, 85, 83, 77])
    """
    highest_indices = a.argsort()[-n:][::-1]
    return highest_indices
import numpy as np
rng = np.random.RandomState()
M = rng.random((3,4))
np.min(M, axis=1)
import numpy as np
# Let's assume you have some data matrix X.
rng = np.random.RandomState()
X: np.ndarray = rng.randint(low=0, high=5, size=9).reshape(3, -1).astype(float)
assert X.ndim == 2, "'X' is a matrix."
# column-wise centering (X is cast to float above so in-place subtraction works)
col_means = np.mean(X, axis=0)
for col_idx, col_vals in enumerate(X.T):
    X[:, col_idx] -= col_means[col_idx]
# row-wise centering
row_means = np.mean(X, axis=1)
for row_idx, row_vals in enumerate(X):
    X[row_idx, :] -= row_means[row_idx]
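The same centering can also be done without explicit loops via NumPy broadcasting; a minimal sketch (the new variable names are just for illustration):
import numpy as np
rng = np.random.RandomState(0)
X = rng.randint(low=0, high=5, size=9).reshape(3, -1).astype(float)
X_col_centered = X - X.mean(axis=0)                 # column-wise centering
X_row_centered = X - X.mean(axis=1, keepdims=True)  # row-wise centering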
The goal of principal component analysis is to compute the most meaningful basis to re-express a noisy dataset.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import decomposition
from numpy import linalg
from typing import Union, Iterable
def PCA_decomposition(X: np.ndarray, n: int = None):
    """Perform PCA decomposition on X.
    Args:
        X (np.ndarray): A tabular (2-D) data matrix consisting entirely
            of numerical features.
        n (int, optional): Number of principal components to keep for the
            decomposition. If None, the decomposition is done using the maximum
            possible number of components.
    Returns:
        (sklearn.decomposition.PCA): A PCA decomposition that has been fitted
            to data matrix X.
    """
    # Find the maximum valid number of principal components
    max_components: int = linalg.matrix_rank(X)
    # Validate 'n', then perform PCA decomposition on X
    if n is not None:
        if not isinstance(n, int):
            raise ValueError("n must be an integer.")
        elif n > max_components:
            raise ValueError(f"'n' value of {n} is too high. "
                             + f"'n' can be at most {max_components}.")
    n_components = n if n is not None else max_components
    pca = decomposition.PCA(n_components=n_components)
    pca.fit(X)
    return pca
def PCA_req_components(pca_X: decomposition.PCA,
                       threshold: float = 0.99,
                       plot=True, verbose=False) -> pd.DataFrame:
    """Compute the number of required components for different variance
    explained values.
    Args:
        pca_X (decomposition.PCA): An instance of sklearn.decomposition.PCA
            that has been fitted to data matrix X.
        threshold (float): Variance-explained threshold drawn on the plot.
        plot (bool, optional): Toggles whether to display a visualization of
            how the number of principal components varies with variance
            explained.
    Returns:
        req_components (pd.DataFrame): A table containing the required number
            of components for the following threshold values
            [0.9, 0.95, 0.97, 0.99, 0.999].
    """
    n_components, n_features = pca_X.components_.shape
    explained_variance = np.cumsum(pca_X.explained_variance_ratio_)
    threshold_line = np.ones(n_components) * threshold
    p_component_idxs = np.arange(n_components) + 1
    if plot:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(p_component_idxs, explained_variance,
                label='cumulative variance explained')
        ax.plot(p_component_idxs, threshold_line, '--', label='threshold')
        # ax.plot(p_component_idxs, pca_X.explained_variance_ratio_, 'o',
        #         label='individual variance explained')
        ax.set(title=f"Variance Explained, n_features = {n_features}",
               xlabel="Principal components",
               ylabel="Percentage of Variance Explained")
        ax.legend()
        plt.show()
    # Thresholds, required components table
    thresholds = [0.9, 0.95, 0.97, 0.99, 0.999]
    idx = []
    for t in thresholds:
        idx.append(list(explained_variance >= t).index(True))
    thresholds, idx = [np.array(l) for l in [thresholds, idx]]
    req_components = pd.DataFrame(np.vstack([thresholds, idx]),
                                  index=["threshold", "principal components"])
    return req_components
def PCA_reduction(X: np.ndarray, pca_X: decomposition.PCA) -> np.ndarray:
    """Transforms a feature matrix with PCA feature reduction.
    Args:
        X (np.ndarray): Feature matrix
        pca_X (decomposition.PCA): An sklearn.decomposition.PCA instance that
            has been fitted to X.
    Returns:
        X_new (np.ndarray): X with PCA feature reduction.
    """
    # 'pca_X' is already fitted, so transform (not fit_transform) is enough.
    X_new = pca_X.transform(X)
    return X_new
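A quick usage sketch of these helpers on synthetic data; the shapes, seed, and threshold below are arbitrary assumptions, not part of the original:
import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(200, 10)                   # 200 samples, 10 numerical features
pca_X = PCA_decomposition(X)             # fit PCA with the maximum number of components
req = PCA_req_components(pca_X, threshold=0.95, plot=False)
print(req)                               # components needed per variance threshold
X_reduced = PCA_reduction(X, pca_X)
print(X_reduced.shape)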
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from typing import Union
from numpy import ndarray
from torch import Tensor
class SupervisedTabular(Dataset):
    def __init__(self,
                 X: Union[ndarray, Tensor],
                 Y: Union[ndarray, Tensor],
                 kind: str = "r"):
        self.X = X
        self.Y = Y
        self.kind = kind
        self.check_for_valid_inputs()
        self.convert_data_to_tensors()
        self.n_samples = self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

    def __len__(self) -> int:
        return self.n_samples

    def check_for_valid_inputs(self):
        X, Y = self.X, self.Y
        assert X.ndim in [0, 1, 2], (
            f"The array dimension of X is too high. X.ndim: {X.ndim}")
        assert Y.ndim in [0, 1, 2], (
            f"The array dimension of Y is too high. Y.ndim: {Y.ndim}")
        assert X.shape[0] == Y.shape[0], (
            "X and Y have different numbers of samples. Dim 0 should match.")
        assert isinstance(X, (ndarray, Tensor))
        assert isinstance(Y, (ndarray, Tensor))
        assert self.kind in ["c", "classification", "r", "regression"], (
            "Attribute 'kind' must be 'c' or 'r' for classification"
            + " or regression.")
        if self.kind in ["c", "classification"]:
            self.kind = "c"
        else:
            self.kind = "r"

    def convert_data_to_tensors(self):
        X, Y = self.X, self.Y
        if isinstance(X, ndarray):
            self.X = torch.from_numpy(X).float()
        elif isinstance(X, Tensor):
            self.X = X.float()
        else:
            raise Exception("Impossible!")
        if isinstance(Y, ndarray):
            Y = Y.reshape(-1)
            if self.kind == "r":
                self.Y = torch.from_numpy(Y).float()
            else:
                self.Y = torch.from_numpy(Y).long()
        elif isinstance(Y, Tensor):
            Y = Y.view(-1)
            if self.kind == "r":
                self.Y = Y.float()
            else:
                self.Y = Y.long()
        else:
            raise Exception("Impossible!")
        assert isinstance(X, (ndarray, Tensor))
# TODO: Dataloader
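Pending the DataLoader TODO above, here is a minimal sketch of how SupervisedTabular could be wrapped in a DataLoader; the toy data and batch size are assumptions for illustration:
import numpy as np
from torch.utils.data import DataLoader
# Toy regression data: 100 samples, 4 features.
rng = np.random.RandomState(0)
X = rng.randn(100, 4)
Y = rng.randn(100)
dataset = SupervisedTabular(X, Y, kind="r")
loader = DataLoader(dataset, batch_size=16, shuffle=True)
for xb, yb in loader:
    print(xb.shape, yb.shape)  # torch.Size([16, 4]) torch.Size([16])
    break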
class MLPClassifier(nn.Module):
    """Feed Forward Neural Network (Classification)"""
    def __init__(self, in_nodes: int, n_classes: int):
        super(MLPClassifier, self).__init__()
        # nn.Linear needs a Python int, so cast the rounded value.
        hidden_dim: int = int(np.around(np.sqrt(in_nodes * n_classes)))
        self.fc_layers = nn.Sequential(
            nn.Linear(in_features=in_nodes, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_classes),
            nn.LeakyReLU())

    def forward(self, x):
        logits = self.fc_layers(x)
        return logits
class MLPRegressor(nn.Module):
    """Feed Forward Neural Network (Regression)"""
    def __init__(self, in_nodes: int):
        super(MLPRegressor, self).__init__()
        self.fc_layers = nn.Sequential(
            nn.Linear(in_features=in_nodes, out_features=10),
            nn.ReLU(),
            nn.Linear(10, 5),
            nn.ReLU(),
            nn.Linear(5, 1),
            nn.LeakyReLU())

    def forward(self, x):
        return self.fc_layers(x)
# TODO: Recurrent Neural Network on sequential data
# TODO: CNN for Image classification
# TODO: NLP
# TODO: Simple GANs
import torch.nn as nn
import numpy as np
np.around?
Just uncomment the next cell if you need to install.
# !pip install pytorch_lightning --quiet
import pytorch_lightning as pl
from scipy import stats
# stats.norm (norm_gen): A normal continuous random variable
mean, var = stats.norm.stats(loc=0, scale=1, moments="mv")
# loc kw specifies mean
# scale kw specifies the standard deviation (not the variance)
mean, var, skew, kurt = stats.norm.stats(loc=0, scale=1, moments="mvsk")
stats.norm.rvs(size=5)
stats.norm?
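A small sketch of evaluating the standard normal with scipy.stats (the printed values are standard textbook numbers):
from scipy import stats
print(stats.norm.pdf(0))       # ~0.3989, the peak of the standard normal density
print(stats.norm.cdf(1.96))    # ~0.975
print(stats.norm.ppf(0.975))   # ~1.96, the inverse cdf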
Informed by the Janki method and my own personal research, it's clear that the interval modifier should be set so that the desired mature retention rate is achieved. The Anki manual cites an equation from SuperMemo, based on Ebbinghaus's forgetting curve, for calculating what Anki's interval modifier should be for a desired retention rate. See docs.ankiweb.net.
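The equation the function below implements (as given in the Anki manual) is: new_interval_modifier = current_interval_modifier * log(desired_retention) / log(current_retention).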
import numpy as np
# Note, the arrow is a type hint: https://www.python.org/dev/peps/pep-0484/
def calcIntervalModifier(modifier_now, success_now, success_wanted=0.85) -> int:
    """Calculates the interval modifier needed for a target retention rate in Anki.
    Args:
        modifier_now (int): Current interval modifier.
        success_wanted (real number): Desired retention rate. This can be lower
            or higher than `success_now` depending on your goals.
        success_now (real number): Current mature retention rate.
    Returns:
        modifier_new (int): Optimal interval modifier.
    Examples:
    --------
    Suppose you have a mature accuracy of 96% for the past month in a deck.
    If your interval modifier is currently set to 100%, you could achieve higher
    efficiency by studying more cards and lowering the accuracy to 90%, and
    even higher efficiency by going down to 80 or 85%.
    >>> calcIntervalModifier(100, .96, .90)
    258
    The above indicates an optimal setting of 258% for the interval modifier.
    >>> calcIntervalModifier(130, .95)
    412
    """
    modifier_new = modifier_now * np.log(success_wanted) / np.log(success_now)
    modifier_new = round(modifier_new)
    return modifier_new
In order to save an environment's packages and versions so that the environment can quickly be reproduced, you need an environment.yml file. To get one named "env_name.yml", activate your environment in the conda prompt and then enter:
conda env export > env_name.yml
This will create a file in the current working directory. Then, recreate the environment from the YAML file:
conda env create -f env_name.yml
To verify the installation, enter conda env list and then activate the env with conda activate env_name. Entering conda info --envs will also work.
source: docs.conda.io
Caffeine has a half-life of about 6 hours in the body for most people.
def caffeine_in_body(hours: float, pct: bool = True,
                     input_caff: float = None) -> float:
    """Args:
        hours (float): Time passed in hours.
        pct (bool, optional): Returns caffeine in the body as a percentage
            if True. Defaults to True.
        input_caff (float, optional): Starting amount of caffeine in the body.
    """
    if pct:
        current_caffeine_pct = 0.5**(hours / 6)
        return current_caffeine_pct
    else:
        if input_caff is None:
            raise ValueError("'input_caff' must be provided when pct=False.")
        current_caffeine = input_caff * 0.5**(hours / 6)
        return current_caffeine
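A quick usage sketch: with a 6-hour half-life, 12 hours is two half-lives, so 25% remains; 200 mg after one half-life leaves 100 mg.
print(caffeine_in_body(hours=12))                             # 0.25
print(caffeine_in_body(hours=6, pct=False, input_caff=200))   # 100.0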