A5 1-D and 2-D Convolutional Neural Networks in Pytorch

  • A5.4: Changed structure of CNN2D.__init__ by having it call the make_conv_and_fc_layers function. It is this function that CNN1D must override, not the __init__ constructor.
  • A5.3: Added two missing statements in CNN2D that initialize layeri to 0 and increment it by 1
  • A5.2: added an exception to CNN2D.__init__ code that provides a helpful message if you specify an impossible configuration for convolutional layers. Repeat this exception code in your CNN1D.__init__ function.
  • A5.1: small edit in CNN2D to allow empty list for n_hiddens_per_fc_layer.

In this assignment, you will experiment with the given convolutional neural network for 2-dimensional input samples (images), in class CNN2D, by applying it to the MNIST data. You will also define a new class for handling 1-dimensional input samples, called CNN1D, that extends CNN2D.

In [1]:
import numpy as np
import torch
import pandas

import matplotlib.pyplot as plt

CNN2D class

Here is a definition of CNN2D.

In [4]:
import numpy as np
import torch


class CNN2D(torch.nn.Module):
    '''Convolutional neural network classifier for 2-D (square, single-channel image) inputs.

    Subclasses for other input dimensionalities (e.g. CNN1D) override
    make_conv_and_fc_layers, not __init__.
    '''

    def __init__(self, n_inputs, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, n_outputs,
                 patch_size_per_conv_layer, stride_per_conv_layer, activation_function='tanh', device='cpu'):
        '''
        n_inputs: total number of values in each sample, e.g. 28 * 28 for a square image
        n_hiddens_per_conv_layer: list of number of units in each convolutional layer
        n_hiddens_per_fc_layer: list of number of units in each fully-connected layer; may be empty
        n_outputs: number of classes
        patch_size_per_conv_layer: kernel size for each convolutional layer
        stride_per_conv_layer: stride for each convolutional layer
        activation_function: 'tanh' selects torch.tanh; any other value selects torch.relu
        device: 'cpu' or a CUDA device

        Raises Exception if the three conv-layer lists differ in length, or if a
        layer's configuration would produce a non-positive output size.
        '''

        super().__init__()

        self.device = device

        # The three per-conv-layer specification lists must agree in length.
        n_conv_layers = len(n_hiddens_per_conv_layer)
        if (len(patch_size_per_conv_layer) != n_conv_layers
            or len(stride_per_conv_layer) != n_conv_layers):
            raise Exception('The lengths of n_hiddens_per_conv_layer, patch_size_per_conv_layer, and stride_per_conv_layer must be equal.')

        self.activation_function = torch.tanh if activation_function == 'tanh' else torch.relu

        # Subclasses customize network structure by overriding this method only.
        self.make_conv_and_fc_layers(n_inputs, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, n_outputs,
                                     patch_size_per_conv_layer, stride_per_conv_layer)

        self.Xmeans = None  # standardization parameters are computed on first call to train
        self.to(self.device)

    def make_conv_and_fc_layers(self, n_inputs, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, n_outputs,
                                patch_size_per_conv_layer, stride_per_conv_layer):
        '''Create self.conv_layers and self.fc_layers for 2-D (square image) inputs.'''
        # Create all convolutional layers.
        # First argument to first Conv2d is number of channels for each pixel;
        # just 1 for our grayscale images.
        n_in = 1
        input_hw = int(np.sqrt(n_inputs))  # original input image height (= width because image assumed square)
        self.conv_layers = torch.nn.ModuleList()
        layeri = 0
        for nh, patch_size, stride in zip(n_hiddens_per_conv_layer,
                                          patch_size_per_conv_layer,
                                          stride_per_conv_layer):
            self.conv_layers.append(torch.nn.Conv2d(n_in, nh, kernel_size=patch_size, stride=stride))
            # Standard no-padding convolution output size.
            conv_layer_output_hw = (input_hw - patch_size) // stride + 1
            if conv_layer_output_hw <= 0:
                raise Exception(f'''For conv layer {layeri}, input_hw of {input_hw} is less than patch_size {patch_size}.
Try reducing the patch_size for this layer or for the previous layer.''')
            input_hw = conv_layer_output_hw  # for next trip through this loop
            n_in = nh
            layeri += 1

        # Create all fully connected layers.  Number of inputs to the first
        # fully-connected layer is the flattened size of the images coming out
        # of the last convolutional layer.
        n_in = input_hw ** 2 * n_in
        self.fc_layers = torch.nn.ModuleList()
        for nh in n_hiddens_per_fc_layer:
            self.fc_layers.append(torch.nn.Linear(n_in, nh))
            n_in = nh
        # Final output layer; always present, so n_hiddens_per_fc_layer may be empty.
        self.fc_layers.append(torch.nn.Linear(n_in, n_outputs))

    def forward_all_outputs(self, X):
        '''Return list of outputs of every layer, starting with the input X itself.'''
        n_samples = X.shape[0]
        Ys = [X]
        for conv_layer in self.conv_layers:
            Ys.append(self.activation_function(conv_layer(Ys[-1])))

        # Flatten last conv layer's output for the fully-connected layers.
        flattened_input = Ys[-1].reshape(n_samples, -1)

        for layeri, fc_layer in enumerate(self.fc_layers[:-1]):
            if layeri == 0:
                Ys.append(self.activation_function(fc_layer(flattened_input)))
            else:
                Ys.append(self.activation_function(fc_layer(Ys[-1])))

        # Output layer has no activation function (CrossEntropyLoss expects logits).
        if len(self.fc_layers) == 1:
            # only the output layer
            Ys.append(self.fc_layers[-1](flattened_input))
        else:
            Ys.append(self.fc_layers[-1](Ys[-1]))

        return Ys

    def forward(self, X):
        '''Return output (logits) of the final layer only.'''
        Ys = self.forward_all_outputs(X)
        return Ys[-1]

    def train(self, X, T, batch_size, n_epochs, learning_rate, method='sgd', verbose=True):
        '''Train on numpy arrays X (samples) and T (class labels); returns self.

        method is 'sgd' (with momentum 0.9) or anything else for Adam.
        NOTE: intentionally shadows torch.nn.Module.train(mode) per this assignment's API.
        '''

        self.classes = np.unique(T)
        # Map class labels to integer indices 0 ... n_classes - 1.
        T = np.arange(len(self.classes))[np.where(T.reshape(-1, 1) == self.classes)[1]]

        # Set data matrices to torch.tensors
        X = torch.from_numpy(X).float().to(self.device)
        T = torch.from_numpy(T).long().to(self.device)  # required for classification in pytorch

        # Setup standardization parameters on first call only, so later calls
        # continue training with the same standardization.
        if self.Xmeans is None:
            self.Xmeans = X.mean(axis=0)
            self.Xstds = X.std(axis=0)
            self.Xstds[self.Xstds == 0] = 1  # So we don't divide by zero when standardizing

        # Standardize X
        X = (X - self.Xmeans) / self.Xstds

        # Not needed for weight updates; retained for gradient-wrt-input experiments.
        X.requires_grad_(True)

        if method == 'sgd':
            optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate, momentum=0.9)
        else:
            optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

        CELoss = torch.nn.CrossEntropyLoss(reduction='mean')
        self.error_trace = []

        # Guard against batch_size > n_samples, which would give zero batches.
        num_batches = max(1, X.shape[0] // batch_size)

        for epoch in range(n_epochs):

            loss_sum = 0.0

            for k in range(num_batches):
                start = k * batch_size
                end = (k + 1) * batch_size
                X_batch = X[start:end, ...]
                T_batch = T[start:end, ...]

                Y = self.forward(X_batch)

                loss = CELoss(Y, T_batch)
                loss.backward()

                # Update parameters
                optimizer.step()
                optimizer.zero_grad()

                # .item() detaches the scalar from the computation graph;
                # accumulating the tensor itself would keep every batch's
                # graph alive until the end of the epoch.
                loss_sum += loss.item()

            self.error_trace.append(loss_sum / num_batches)

            if verbose and (epoch + 1) % (max(1, n_epochs // 10)) == 0:
                print(f'{method}: Epoch {epoch + 1} Loss {self.error_trace[-1]:.3f}')

        return self

    def softmax(self, Y):
        '''Apply to final layer weighted sum outputs'''
        # Trick to avoid overflow
        maxY = torch.max(Y, axis=1)[0].reshape((-1, 1))
        expY = torch.exp(Y - maxY)
        denom = torch.sum(expY, axis=1).reshape((-1, 1))
        Y = expY / denom
        return Y

    def use(self, X):
        '''Return (predicted class labels as column vector, class probabilities) for numpy array X.'''
        # Set input matrix to torch.tensors
        X = torch.from_numpy(X).float().to(self.device)
        # Standardize X with the parameters computed during training
        X = (X - self.Xmeans) / self.Xstds
        # No gradients needed for inference; saves memory and time.
        with torch.no_grad():
            Y = self.forward(X)
            # Convert output logits to class probabilities
            probs = self.softmax(Y)
        # For each sample pick highest probability and translate that to class labels
        classes = self.classes[torch.argmax(probs, axis=1).cpu().numpy()].reshape(-1, 1)
        return classes, probs.cpu().numpy()

CNN2D on MNIST Digits

We will use a bunch (50,000) of images of hand-drawn digits from this deeplearning.net site. Download mnist.pkl.gz.

This pickle file includes data already partitioned into training, validation, and test sets. To read it into python, use the following steps

In [7]:
import pickle
import gzip

# mnist.pkl.gz contains the (train, validation, test) splits already
# partitioned; latin1 encoding is needed for pickles written by Python 2.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

# Each split is (inputs, labels); make labels column vectors.
Xtrain = train_set[0]
Ttrain = train_set[1].reshape(-1, 1)

Xval = valid_set[0]
Tval = valid_set[1].reshape(-1, 1)

Xtest = test_set[0]
Ttest = test_set[1].reshape(-1, 1)

# Reshape flat 784-value rows to (n_samples, 1 channel, 28, 28) for Conv2d.
# NOTE(review): Xval is left flat here — reshape it too before passing it to CNN2D.
Xtrain = Xtrain.reshape(-1, 1, 28, 28)
Xtest = Xtest.reshape(-1, 1, 28, 28)

print(Xtrain.shape, Ttrain.shape,  Xval.shape, Tval.shape,  Xtest.shape, Ttest.shape)
(50000, 1, 28, 28) (50000, 1) (10000, 784) (10000, 1) (10000, 1, 28, 28) (10000, 1)
In [17]:
# Default to CPU; offer the GPU only when CUDA is actually available.
device = 'cpu'
if torch.cuda.is_available():
    y_or_n = input('Would you like to run on the GPU? (y or n): ')
    if y_or_n == 'y' or y_or_n == 'yes':
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on', device)
Would you like to run on the GPU? (y or n): y
Running on cuda:0
In [9]:
len(np.unique(Ttrain))
Out[9]:
10
In [10]:
# Two conv layers of 10 units each (patch sizes 10 and 5, strides 4 and 2),
# then one fully-connected layer of 5 units before the 10-unit output layer.
n_hiddens_per_conv_layer = [10, 10]
patch_size_per_conv_layer = [10, 5]
stride_per_conv_layer=[4, 2]
n_hiddens_per_fc_layer = [5]

cnnet = CNN2D(28 * 28, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, len(np.unique(Ttrain)), 
              patch_size_per_conv_layer, stride_per_conv_layer, device=device)

n_epochs = 20
batch_size = 500
learning_rate = 0.01

cnnet.train(Xtrain, Ttrain, batch_size, n_epochs, learning_rate, method='adam')

# Mean cross-entropy loss per epoch.
plt.plot(cnnet.error_trace, label='Pytorch')
plt.title('MNIST')
adam: Epoch 2 Loss 0.451
adam: Epoch 4 Loss 0.289
adam: Epoch 6 Loss 0.246
adam: Epoch 8 Loss 0.229
adam: Epoch 10 Loss 0.218
adam: Epoch 12 Loss 0.204
adam: Epoch 14 Loss 0.194
adam: Epoch 16 Loss 0.191
adam: Epoch 18 Loss 0.183
adam: Epoch 20 Loss 0.181
Out[10]:
Text(0.5, 1.0, 'MNIST')
In [19]:
def confusion_matrix(Y_classes, T):
    '''Return a DataFrame of percents: rows are true classes, columns are
    predicted classes, so each row sums to 100 for that true class.'''
    class_names = np.unique(T)
    percents = [[100 * np.mean(Y_classes[T == actual] == predicted)
                 for predicted in class_names]
                for actual in class_names]
    return pandas.DataFrame(percents, index=class_names, columns=class_names)
In [12]:
# Predict test-set classes; accuracy is the percent of labels that match.
Classes, _ = cnnet.use(Xtest)
perc_correct = 100 * np.mean(Classes == Ttest)
print(f'Test accuracy in percent correct: {perc_correct:.2f}')
confusion_matrix(Classes, Ttest)
Test accuracy in percent correct: 93.75
Out[12]:
0 1 2 3 4 5 6 7 8 9
0 95.918367 0.000000 0.510204 0.408163 0.102041 0.612245 1.734694 0.408163 0.102041 0.204082
1 0.000000 97.797357 0.616740 0.176211 0.088106 0.000000 0.528634 0.088106 0.616740 0.088106
2 0.581395 0.581395 94.573643 1.162791 0.096899 0.484496 0.290698 0.872093 1.356589 0.000000
3 0.099010 0.000000 2.277228 91.683168 0.000000 1.485149 0.198020 1.089109 3.168317 0.000000
4 0.101833 0.000000 0.000000 0.101833 94.399185 0.000000 1.120163 0.712831 0.712831 2.851324
5 0.224215 0.000000 0.448430 1.569507 0.672646 92.040359 1.008969 0.336323 3.699552 0.000000
6 0.835073 0.521921 0.521921 0.000000 0.521921 1.043841 96.555324 0.000000 0.000000 0.000000
7 0.194553 0.291829 2.431907 0.875486 0.389105 0.194553 0.000000 94.163424 0.583658 0.875486
8 0.205339 1.129363 0.821355 3.182752 0.102669 0.924025 0.410678 0.821355 91.683778 0.718686
9 0.594648 0.495540 0.099108 0.891972 5.351833 0.396432 0.099108 2.378593 1.585728 88.107037

Experiments

To explore the effects of different CNN structures, show results for the following steps. For each use the same number of epochs, batch size, and learning rate as used above.

  1. Compare test accuracy of CNN2D nets with one, two and three convolutional layers, each with 10 units and patch sizes of 5 and strides of 2.
  2. Using the best number of convolutional layers found in Step 1, compare the test accuracies of CNN2d nets with zero, one, and two fully-connected layers each with 10 hidden units.

Combine the results of each of your runs and display them in a pandas.Dataframe that includes the network structure and percent correct on train and test sets. Discuss your results, and describe the network structure and training parameters that produced the best test results.

CNN1D

Complete the following code cell to define CNN1D. The only change from CNN2D that is required is in the constructor. Complete these steps.

  1. Copy the __init__ function from CNN2D.
  2. For each convolutional layer, create a torch.nn.Conv1d object instead of a torch.nn.Conv2d object.
  3. Modify the statement input_hw = int(np.sqrt(n_inputs)) appropriately.
  4. Modify the statement n_in = input_hw ** 2 * n_in appropriately.
In [ ]:
class CNN1D(CNN2D):
    '''Convolutional neural network classifier for 1-D input samples.

    Only layer construction differs from CNN2D, so just this method is
    overridden; __init__, forward, train, and use are inherited.
    '''

    def make_conv_and_fc_layers(self, n_inputs, n_hiddens_per_conv_layer, n_hiddens_per_fc_layer, n_outputs,
                                patch_size_per_conv_layer, stride_per_conv_layer):
        '''Create self.conv_layers and self.fc_layers for 1-D inputs.'''
        # First argument to first Conv1d is number of channels per value;
        # just 1 for our single-channel signals.
        n_in = 1
        input_hw = n_inputs  # 1-D: the input length itself, no square root needed
        self.conv_layers = torch.nn.ModuleList()
        layeri = 0
        for nh, patch_size, stride in zip(n_hiddens_per_conv_layer,
                                          patch_size_per_conv_layer,
                                          stride_per_conv_layer):
            self.conv_layers.append(torch.nn.Conv1d(n_in, nh, kernel_size=patch_size, stride=stride))
            conv_layer_output_hw = (input_hw - patch_size) // stride + 1
            if conv_layer_output_hw <= 0:
                raise Exception(f'''For conv layer {layeri}, input_hw of {input_hw} is less than patch_size {patch_size}.
Try reducing the patch_size for this layer or for the previous layer.''')
            input_hw = conv_layer_output_hw  # for next trip through this loop
            n_in = nh
            layeri += 1

        # Create all fully connected layers.  Flattened size of the last conv
        # layer's output is length times channels (no squaring for 1-D).
        n_in = input_hw * n_in
        self.fc_layers = torch.nn.ModuleList()
        for nh in n_hiddens_per_fc_layer:
            self.fc_layers.append(torch.nn.Linear(n_in, nh))
            n_in = nh
        self.fc_layers.append(torch.nn.Linear(n_in, n_outputs))

Toy Data to Test CNN1D

Here is some toy data to test your CNN1D definition. Each sample is now 1-dimensional. Let's make vectors of two kinds, ones with square pulses and ones with triangular pulses, at random locations and with random durations. Both kinds will be 100 values long, with zeros between the pulses.

In [3]:
def make_square_pulse():
    '''Return a 100-value vector of zeros containing 1 to 4 randomly
    placed square pulses of height 1 and width 5 to 19.'''
    sample = np.zeros(100)
    n_pulses = np.random.randint(1, 5)
    for _ in range(n_pulses):
        start = np.random.randint(0, 80)
        width = np.random.randint(5, 20)
        sample[start:start + width] = 1
    return sample
In [7]:
make_square_pulse()
Out[7]:
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
In [9]:
plt.plot(make_square_pulse());
In [10]:
def make_triangular_pulse():
    '''Return a 100-value vector of zeros containing 1 to 4 randomly
    placed triangular pulses that ramp 0 -> 1 -> 0 over an even width.'''
    sample = np.zeros(100)
    n_pulses = np.random.randint(1, 5)
    for _ in range(n_pulses):
        start = np.random.randint(0, 80)
        width = np.random.randint(5, 20)
        width += width % 2  # round odd widths up so the pulse splits evenly
        half = width // 2
        sample[start:start + half] = np.linspace(0, 1, half)
        sample[start + half:start + width] = np.linspace(1, 0, half)
    return sample
In [11]:
plt.plot(make_triangular_pulse());
In [12]:
# Build train and test sets of 500 square-pulse plus 500 triangular-pulse samples each.
n_each = 500
Xtrain = np.array([make_square_pulse() for i in range(n_each)] +
                   [make_triangular_pulse() for i in range(n_each)])
Ttrain = np.array(['square'] * n_each + ['triangular'] * n_each).reshape(-1, 1)
n_each = 500
Xtest = np.array([make_square_pulse() for i in range(n_each)] +
                   [make_triangular_pulse() for i in range(n_each)])
Ttest = np.array(['square'] * n_each + ['triangular'] * n_each).reshape(-1, 1)
Xtrain.shape, Ttrain.shape, Xtest.shape, Ttest.shape
Out[12]:
((1000, 100), (1000, 1), (1000, 100), (1000, 1))
In [13]:
np.newaxis == None
Out[13]:
True
In [14]:
# Insert a channel axis for Conv1d: (1000, 100) -> (1000, 1, 100).
Xtrain = Xtrain[:, None, :]
Xtrain.shape
Out[14]:
(1000, 1, 100)
In [15]:
# Equivalent reshape to (n_samples, 1 channel, n_values), also applied to
# Xtest; a no-op for Xtrain, which already got the channel axis above.
Xtrain = Xtrain.reshape(Xtrain.shape[0], 1, -1)
Xtest = Xtest.reshape(Xtest.shape[0], 1, -1)
Xtrain.shape, Xtest.shape
Out[15]:
((1000, 1, 100), (1000, 1, 100))
In [18]:
# 100 inputs; conv layers of 10 and 5 units (patch sizes 10 and 4, strides 1
# and 2); fc layers of 5 and 5 units; 2 output classes.
cnnet1 = CNN1D(100, [10, 5], [5, 5], 2, [10, 4], [1, 2], device=device)

n_epochs = 100
batch_size = 10
learning_rate = 0.001

cnnet1.train(Xtrain, Ttrain, batch_size, n_epochs, learning_rate, method='adam')

plt.plot(cnnet1.error_trace, label='Pytorch')
plt.title('Pulses')
adam: Epoch 10 Loss 0.682
adam: Epoch 20 Loss 0.386
adam: Epoch 30 Loss 0.109
adam: Epoch 40 Loss 0.024
adam: Epoch 50 Loss 0.003
adam: Epoch 60 Loss 0.001
adam: Epoch 70 Loss 0.001
adam: Epoch 80 Loss 0.000
adam: Epoch 90 Loss 0.000
adam: Epoch 100 Loss 0.000
Out[18]:
Text(0.5, 1.0, 'Pulses')
In [20]:
# Predict test-set classes for the pulse data and summarize per-class accuracy.
Classes, _ = cnnet1.use(Xtest)
perc_correct = 100 * np.mean(Classes == Ttest)
print(f'Test accuracy in percent correct: {perc_correct:.2f}')
confusion_matrix(Classes, Ttest)
Test accuracy in percent correct: 99.10
Out[20]:
square triangular
square 98.2 1.8
triangular 0.0 100.0
In [21]:
# Weights of the first conv layer: shape (n_units, n_in_channels, kernel_size).
# Plot each unit's learned kernel over its 10 time steps.
W = list(cnnet1.children())[0][0].weight.data.cpu()  # in case running on GPU
plt.plot(W[:, 0, :].T);
W[:, 0, :].T.shape
Out[21]:
torch.Size([10, 10])

Experiments on ECG Data

An electrocardiogram, or ECG, is a record in time of a voltage generated by the heart. It can be used to diagnose abnormalities in the heart.

Public datasets containing ECG traces are available, such as the Non-Invasive Fetal ECG Arrhythmia Database site. The data files there are in a standard waveform-database (WFDB) format. As is often the case for most standard data formats you run into, a python package exists for reading this data, called wfdb, that you can install using conda.

This data set includes ECG from normal patients and from ones with arrythmias, with data file names like ARR_01.dat and NR_01.dat, respectively. We have already downloaded these files, read them in using the wfdb package and collected them into segments of 2000 voltages. The sample rate for this data is 1000 Hz, so 2000 voltages spans 2 seconds. Download this data set from ecg.npy

Now, our job for our CNN1D is to classify each 2000 sample segment into the classes normal or arrythmia.

After you have downloaded ecg.npy, you can load it and plot a few samples.

In [22]:
# ecg.npy holds 'arrythmia' and 'normal' arrays of 2000-sample (2 second) ECG segments.
# NOTE(review): indexing np.load's result with strings suggests a pickled dict
# or structured array; newer numpy may require allow_pickle=True — confirm.
ecg = np.load('ecg.npy')
arr = ecg['arrythmia']
norm = ecg['normal']
arr.shape, norm.shape
Out[22]:
((2429, 2000), (3634, 2000))
In [23]:
# Plot two arrhythmia segments (top row) and two normal segments (bottom row).
plt.figure(figsize=(15, 15))

plt.subplot(2, 2, 1)
plt.plot(arr[0])
plt.legend(('Arrythmia',))
plt.subplot(2, 2, 2)
plt.plot(arr[100])
plt.legend(('Arrythmia',))

plt.subplot(2, 2, 3)
plt.plot(norm[0])
plt.legend(('Normal',))
plt.subplot(2, 2, 4)
plt.plot(norm[100])
plt.legend(('Normal',));

Now, let's stack the arr and norm samples together, create class labels for each sample, randomly rearrange them, and divide into train and test sets.

In [24]:
# Stack both classes, add a channel axis, label, shuffle, and split 80/20.
X = np.vstack((arr, norm))
X = X.reshape(X.shape[0], 1, -1)  # (n_samples, 1 channel, 2000)
T = np.hstack((['arr'] * arr.shape[0], ['nr'] * norm.shape[0])).reshape(-1, 1)
n_samples = X.shape[0]
rows = np.arange(n_samples)
np.random.shuffle(rows)  # shuffle indices so the split mixes both classes
n_train = int(n_samples * 0.8)
Xtrain = X[rows[:n_train], ...]
Ttrain = T[rows[:n_train], ...]
Xtest = X[rows[n_train:], ...]
Ttest = T[rows[n_train:], ...]

Xtrain.shape, Ttrain.shape, Xtest.shape, Ttest.shape
Out[24]:
((4850, 1, 2000), (4850, 1), (1213, 1, 2000), (1213, 1))
In [25]:
Ttrain
Out[25]:
array([['nr'],
       ['arr'],
       ['nr'],
       ...,
       ['nr'],
       ['nr'],
       ['nr']], dtype='<U3')

Okay, ready to train. Create a CNN1D network with a statement like this.

In [34]:
cnn1d = CNN1D(Xtrain.shape[-1], [5, 10], [10, 10], 2, [100, 20], [20, 5], device=device)

Now, experiment with at least ten different network structures, patch sizes and strides and compare them with the percent accuracy on test data. Combine the results of each of your runs and display them in a pandas.Dataframe that includes the network structure and percent correct on train and test sets. Discuss your results, and describe the network structure and training parameters that produced the best test results.

Grading and Check-In

Download A5grader.zip and extract A5grader.py from it. Run the code in the following cell to demonstrate an example grading session. Remember to test your code with additional tests of your own design. Your notebook must be named as Lastname-A5.ipynb.

When ready, submit your notebook via the A5 link in our class Canvas web page.

In [14]:
%run -i A5grader.py
Testing

    xs = np.arange(100)
    n_each = 500
    n_samples = n_each * 2
    X = np.array([np.sin(xs / 2) + np.random.normal(0, 1, size=100) for i in range(n_each)] +
                 [np.sin(xs / 3) + np.random.normal(0, 1, size=100) for i in range(n_each)])
    X = X[:, np.newaxis, :]
    T = np.array([2] * n_each + [3] * n_each).reshape(-1, 1)
    rows = np.arange(n_samples)
    np.random.shuffle(rows)
    X = X[rows, ...]
    T = T[rows, ...]
    n_train = int(n_samples * 0.8)
    Xtrain = X[:n_train, ...]
    Ttrain = T[:n_train, :]
    Xtest = X[n_train:, ...]
    Ttest = T[n_train:, :]

    cnn1d = CNN1D(100, [5, 5], [3], 2, [10, 5], [1, 2])
    cnn1d.train(Xtrain, Ttrain, 10, 20, 0.01, method='adam')

    perc_train = 100 * np.mean(cnn1d.use(Xtrain)[0] == Ttrain)
    perc_test = 100 * np.mean(cnn1d.use(Xtest)[0] == Ttest)

adam: Epoch 2 Loss 0.016
adam: Epoch 4 Loss 0.005
adam: Epoch 6 Loss 0.002
adam: Epoch 8 Loss 0.002
adam: Epoch 10 Loss 0.001
adam: Epoch 12 Loss 0.001
adam: Epoch 14 Loss 0.001
adam: Epoch 16 Loss 0.000
adam: Epoch 18 Loss 0.000
adam: Epoch 20 Loss 0.000

--- 40/40 points.  perc_train is correctly 100%.

--- 40/40 points.  perc_test is correctly 100%.

======================================================================
A5 Execution Grade is 80 / 80
======================================================================

 __ / 10 Based on results of experiments with MNIST data.

 __ / 10 Based on results of experiments with ECG data.

======================================================================
A5 FINAL GRADE is  _  / 100
======================================================================

Extra Credit:

Earn up to 3 extra credit points on this assignment by doing any or all of the following experiments.

1. Compare your results on the MNIST data by using relu versus tanh activation functions. Show and 
   discuss the results.

2. Compare your results on the MNIST data using adam versus sgd. Show and discuss the results.

3. Download another image data set, apply your CNN2D class to this data and discuss the results.

A5 EXTRA CREDIT is 0 / 3

Extra Credit

Earn up to 3 extra credit points on this assignment by doing any or all of the following experiments.

  1. Compare your results on the MNIST data by using relu versus tanh activation functions. Show and discuss the results.
  2. Compare your results on the MNIST data using adam versus sgd. Show and discuss the results.
  3. Download another image data set, apply your CNN2D class to this data and discuss the results.