import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt
np.random.seed(0)
dtype_float = torch.FloatTensor
dtype_long = torch.LongTensor
Read in the data:
dat = pd.read_csv("https://raw.githubusercontent.com/salesforce/TransmogrifAI/master/helloworld/src/main/resources/IrisDataset/bezdekIris.data", header = None)
dat
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
Compute the train/test/validation split, and transform the y's into integers.
from sklearn.preprocessing import LabelEncoder
all_x = dat.loc[:, 0:3].to_numpy()
all_y_raw = dat.loc[:, 4].to_numpy()
le = LabelEncoder()
le.fit(all_y_raw)
all_y = le.transform(all_y_raw)
idx = np.random.permutation(range(all_x.shape[0]))
train_idx = idx[:60]
test_idx = idx[60:100]
valid_idx = idx[100:]
train_x = all_x[train_idx]
train_y = all_y[train_idx]
test_x = all_x[test_idx]
test_y = all_y[test_idx]
valid_x = all_x[valid_idx]
valid_y = all_y[valid_idx]
Read the train/validation/test sets into torch variables
x = Variable(torch.from_numpy(train_x), requires_grad=False).type(dtype_float)
y_classes = Variable(torch.from_numpy(train_y), requires_grad=False).type(dtype_long)
x_valid = Variable(torch.from_numpy(valid_x), requires_grad=False).type(dtype_float)
y_validclasses = Variable(torch.from_numpy(valid_y), requires_grad=False).type(dtype_long)
x_test = Variable(torch.from_numpy(test_x), requires_grad=False).type(dtype_float)
y_testclasses = Variable(torch.from_numpy(test_y), requires_grad=False).type(dtype_long)
Define a network that corrsponds to multinomial logistic regression:
dim_x = 4 # number of features: in this case 4 attributes
dim_out = 3 # number of classes we're predicting. We have 3 types of flowers
model_logreg = torch.nn.Sequential(
torch.nn.Linear(dim_x, dim_out)
)
Train the logistic regression:
loss_train = [] # loss for the training set
loss_valid = [] # loss for the validation set
learning_rate = 1e-3
N = 10000 # number of gradient descent iterations
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_logreg.parameters(), lr=learning_rate)
for t in range(N):
y_pred = model_logreg(x)
loss = loss_fn(y_pred, y_classes)
loss_train.append(loss.data.numpy().reshape((1,))[0]/len(y_classes))
y_validpred = model_logreg(x_valid)
loss_v = loss_fn(y_validpred, y_validclasses)
loss_valid.append(loss_v.data.numpy().reshape((1,))[0]/len(y_validclasses))
model_logreg.zero_grad() # Zero out the previous gradient computation
loss.backward() # Compute the gradient
optimizer.step() # Use the gradient information to
# make a step
# Problem 3: Plotting learning curve
plt.plot(range(N), loss_valid, color='b', label='validation')
plt.plot(range(N), loss_train, color='r', label='training')
plt.legend(loc='upper left')
plt.title("Learning curve for training and validation sets")
plt.xlabel("Number of iterations (N)")
plt.ylabel("Cross entropy loss")
plt.show()
There are several things to note here:
When computing the cost to display, we divided the cost for the entire training/validation set by the number of training cases. That makes the training and validation costs displayed commesurate (since the training set is of size 60 and the validation set is of size 50, we'd expect the training cost to be larger just because it's the sum of the costs for all the datapoints).
It's quite difficult to make out any difference between the training and the validation sets here. If we re-run the same code (but starting from the weights obtained at iteration 10000, since we won't redefine model_logreg
, we'll see some difference (even though it is tiny)
loss_train = [] # loss for the training set
loss_valid = [] # loss for the validation set
learning_rate = 1e-3
N = 10000 # number of gradient descent iterations
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_logreg.parameters(), lr=learning_rate)
for t in range(N):
y_pred = model_logreg(x)
loss = loss_fn(y_pred, y_classes)
loss_train.append(loss.data.numpy().reshape((1,))[0]/len(y_classes))
y_validpred = model_logreg(x_valid)
loss_v = loss_fn(y_validpred, y_validclasses)
loss_valid.append(loss_v.data.numpy().reshape((1,))[0]/len(y_validclasses))
model_logreg.zero_grad() # Zero out the previous gradient computation
loss.backward() # Compute the gradient
optimizer.step() # Use the gradient information to
# make a step
# Problem 3: Plotting learning curve
plt.plot(range(N, 2*N), loss_valid, color='b', label='validation')
plt.plot(range(N, 2*N), loss_train, color='r', label='training')
plt.legend(loc='upper left')
plt.title("Learning curve for training and validation sets")
plt.xlabel("Number of iterations (N)")
plt.ylabel("Cross entropy loss")
plt.show()
loss_train[-1]
0.0010166171938180923
Let's now define a one-hidden layer neural network with 3 hidden units:
dim_x = 4 # number of features: in this case 4 attributes
dim_h = 3 # number of hidden units
dim_out = 3 # number of classes we're predicting. We have 3 types of flowers
model_ann = torch.nn.Sequential(
torch.nn.Linear(dim_x, dim_h),
torch.nn.ReLU(),
torch.nn.Linear(dim_h, dim_out),
)
loss_fn = torch.nn.CrossEntropyLoss()
loss_train = [] # loss for the training set
loss_valid = [] # loss for the validation set
learning_rate = 1e-3
N = 10000 # number of gradient descent iterations
optimizer = torch.optim.Adam(model_ann.parameters(), lr=learning_rate)
for t in range(N):
y_pred = model_ann(x)
loss = loss_fn(y_pred, y_classes)
loss_train.append(loss.data.numpy().reshape((1,))[0]/len(y_classes))
y_validpred = model_ann(x_valid)
loss_v = loss_fn(y_validpred, y_validclasses)
loss_valid.append(loss_v.data.numpy().reshape((1,))[0]/len(y_validclasses))
model_ann.zero_grad() # Zero out the previous gradient computation
loss.backward() # Compute the gradient
optimizer.step() # Use the gradient information to
# make a step
Let's now plot the learning curves for the ANN:
plt.plot(range(N), loss_valid, color='b', label='validation')
plt.plot(range(N), loss_train, color='r', label='training')
plt.legend(loc='upper left')
plt.title("Learning curve for training and validation sets")
plt.xlabel("Number of iterations (N)")
plt.ylabel("Cross entropy loss")
plt.show()
Because the neural network is more flexible, we expect to see smaller costs for the training set. Let's display the next 10,000 iterations as before:
loss_train = [] # loss for the training set
loss_valid = [] # loss for the validation set
learning_rate = 1e-3
N = 10000 # number of gradient descent iterations
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_logreg.parameters(), lr=learning_rate)
for t in range(N):
y_pred = model_logreg(x)
loss = loss_fn(y_pred, y_classes)
loss_train.append(loss.data.numpy().reshape((1,))[0]/len(y_classes))
y_validpred = model_logreg(x_valid)
loss_v = loss_fn(y_validpred, y_validclasses)
loss_valid.append(loss_v.data.numpy().reshape((1,))[0]/len(y_validclasses))
model_logreg.zero_grad() # Zero out the previous gradient computation
loss.backward() # Compute the gradient
optimizer.step() # Use the gradient information to
# make a step
# Problem 3: Plotting learning curve
plt.plot(range(N, 2*N), loss_valid, color='b', label='validation')
plt.plot(range(N, 2*N), loss_train, color='r', label='training')
plt.legend(loc='upper left')
plt.title("Learning curve for training and validation sets")
plt.xlabel("Number of iterations (N)")
plt.ylabel("Cross entropy loss")
plt.show()
loss_train[-1]
0.0010073543215791383
The loss is a tiny bit smaller per training example. The validation cost is also similar.
dim_x = 4 # number of features: in this case 4 attributes
dim_h = 3 # number of hidden units
dim_out = 3 # number of classes we're predicting. We have 3 types of flowers
model = torch.nn.Sequential(
torch.nn.Linear(dim_x, dim_h),
torch.nn.ReLU(),
torch.nn.Linear(dim_h, dim_out),
)
# Problem 4: get the performance on the testing set
# idea is to train network with different dim_h and pick the one with the highest valudation set accuracy as your model of choice
y_trainpred = model(x).data.numpy()
y_validpred = model(x_valid).data.numpy()
y_testpred = model(x_test).data.numpy()
print('accuracy on training set: ', np.mean(np.argmax(y_trainpred, 1) == train_y))
print('accuracy on validation set: ', np.mean(np.argmax(y_validpred, 1) == valid_y))
print('accuracy on testing set: ', np.mean(np.argmax(y_testpred, 1) == test_y))
# Problem 5: regularization
# Idea is to just rerun the gradient descent, but add the penalty term to loss function
lam=0.01
learning_rate = 1e-3
N = 10000 # number of gradient descent iterations
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(N):
y_pred = model(x)
loss = loss_fn(y_pred, y_classes) + lam * ( torch.pow(torch.pow(torch.norm(model[0].weight), 2) + torch.pow(torch.norm(model[2].weight), 2), 0.5))
loss_train.append(loss.data.numpy().reshape((1,))[0])
y_validpred = model(x_valid)
loss_v = loss_fn(y_validpred, y_validclasses)
loss_valid.append(loss_v.data.numpy().reshape((1,))[0])
model.zero_grad() # Zero out the previous gradient computation
loss.backward() # Compute the gradient
optimizer.step() # Use the gradient information to
# make a step
y_trainpred = model(x).data.numpy()
y_validpred = model(x_valid).data.numpy()
y_testpred = model(x_test).data.numpy()
print('accuracy on training set: ', np.mean(np.argmax(y_trainpred, 1) == train_y))
print('accuracy on validation set: ', np.mean(np.argmax(y_validpred, 1) == valid_y))
print('accuracy on testing set: ', np.mean(np.argmax(y_testpred, 1) == test_y))
# Problem 6: predicting probabilities for a new instance
#SepalLength = 5.8
#SepalWidth = 5.8
#PetalLength = 5.8
#PetalWidth = 5.8
x_new = np.array([5.8,5.8,5.8,5.8])
x_newtorch = Variable(torch.from_numpy(x_new), requires_grad=False).type(dtype_float)
predictedy = model(x_newtorch).data.numpy() # these are the raw output values for each of the 3 classes
# Apply softmax to obtain class probabilities: exp(o_i) / (sum_i exp(o_i) )
#Setosa, Versicolor, and Viriginica correspond to 0, 1, and 2 respectively
probs = np.exp(predictedy) / np.sum(np.exp(predictedy))
print('predicted class label: ', np.argmax(probs))
#Problem 7: with three hidden units, take the first hidden unit, and display the weights that connect the inputs to it.
# weights from input to hidden layer:
model[0].weight # row i of the 2d array represents weights to the i_th hidden unit
weights_u1 = model[0].weight[0,:]
print(weights_u1)
accuracy on training set: 0.26666666666666666 accuracy on validation set: 0.38 accuracy on testing set: 0.375 accuracy on training set: 0.9833333333333333 accuracy on validation set: 0.94 accuracy on testing set: 0.975 predicted class label: 2 tensor([-6.0148e-41, 1.8359e-40, -1.5789e-40, 1.2537e-40], grad_fn=<SliceBackward>)