Implementation of a method for ordinal regression by Polat et al 2022 [1].
Paper reference:
We will be using the cement_strength dataset from https://github.com/gagolews/ordinal_regression_data/blob/master/cement_strength.csv.
First, we are going to download and prepare the and save it as CSV files locally. This is a general procedure that is not specific to CORN.
This dataset has 5 ordinal labels (1, 2, 3, 4, and 5). Note that we require labels to be starting at 0, which is why we subtract "1" from the label column.
import pandas as pd
import numpy as np
data_df = pd.read_csv("https://raw.githubusercontent.com/gagolews/ordinal_regression_data/master/cement_strength.csv")
data_df["response"] = data_df["response"]-1 # labels should start at 0
data_labels = data_df["response"]
data_features = data_df.loc[:, ["V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8"]]
print('Number of features:', data_features.shape[1])
print('Number of examples:', data_features.shape[0])
print('Labels:', np.unique(data_labels.values))
Number of features: 8 Number of examples: 998 Labels: [0 1 2 3 4]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
data_features.values,
data_labels.values,
test_size=0.2,
random_state=1,
stratify=data_labels.values)
/Users/sebastianraschka/miniforge3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1 warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
In this section, we set up the data set and data loaders. This is a general procedure that is not specific to CORN.
import torch
##########################
### SETTINGS
##########################
# Hyperparameters
random_seed = 1
learning_rate = 0.0001
num_epochs = 50
batch_size = 128
# Architecture
NUM_CLASSES = 5
# Other
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on', DEVICE)
Training on cpu
from torch.utils.data import Dataset
class MyDataset(Dataset):
def __init__(self, feature_array, label_array, dtype=np.float32):
self.features = feature_array.astype(np.float32)
self.labels = label_array
def __getitem__(self, index):
inputs = self.features[index]
label = self.labels[index]
return inputs, label
def __len__(self):
return self.labels.shape[0]
import torch
from torch.utils.data import DataLoader
# Note transforms.ToTensor() scales input images
# to 0-1 range
train_dataset = MyDataset(X_train_std, y_train)
test_dataset = MyDataset(X_test_std, y_test)
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True, # want to shuffle the dataset
num_workers=0) # number processes/CPUs to use
test_loader = DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=0)
# Checking the dataset
for inputs, labels in train_loader:
print('Input batch dimensions:', inputs.shape)
print('Input label dimensions:', labels.shape)
break
Input batch dimensions: torch.Size([128, 8]) Input label dimensions: torch.Size([128])
According to the paper, the loss is described as follows:
$$\mathbf{C D WC E}=-\sum_{i=0}^{N-1} \log (1-\hat{y}) \times|i-c|^{\text {power }}$$where
import torch.nn.functional as F
targets = torch.tensor([0, 2, 1, 2])
logits = torch.tensor( [[-0.3, -0.5, -0.5], # each row is 1 training example
[-0.4, -0.1, -0.5],
[-0.3, -0.94, -0.5],
[-0.99, -0.88, -0.5]])
probas = F.softmax(logits, dim=1)
probas
tensor([[0.3792, 0.3104, 0.3104], [0.3072, 0.4147, 0.2780], [0.4263, 0.2248, 0.3490], [0.2668, 0.2978, 0.4354]])
def cdw_ce_loss_naive1(probas, targets, power=5):
loss = torch.zeros(probas.shape[0])
for example in range(probas.shape[0]):
for i in range(probas.shape[1]):
loss[example] += -torch.log(1-probas[example, i]) * torch.abs(i - targets[example])**power
return loss
cdw_ce_loss_naive1(probas, targets)
tensor([12.2654, 12.2824, 0.9848, 10.2828])
%timeit cdw_ce_loss_naive1(probas, targets)
204 µs ± 3.06 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
def cdw_ce_loss_naive2(probas, targets, power=5):
loss = 0.
for i in range(probas.shape[1]):
loss += (-torch.log(1-probas[:, i]) * torch.abs(i - targets)**power)
return loss
cdw_ce_loss_naive2(probas, targets)
tensor([12.2654, 12.2824, 0.9848, 10.2828])
%timeit cdw_ce_loss_naive2(probas, targets)
44.8 µs ± 542 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
def cdw_ce_loss_naive3(probas, targets, power=5):
labels = torch.arange(probas.shape[1]).repeat(probas.shape[0], 1)
loss = (-torch.log(1-probas) * torch.abs(labels - targets.reshape(probas.shape[0], 1))**power).sum(dim=1)
return loss
cdw_ce_loss_naive3(probas, targets)
tensor([12.2654, 12.2824, 0.9848, 10.2828])
%timeit cdw_ce_loss_naive3(probas, targets)
19.3 µs ± 95.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
def cdw_ce_loss(logits, targets, power=5, reduction="mean"):
probas = torch.softmax(logits, dim=1)
labels = torch.arange(probas.shape[1]).repeat(probas.shape[0], 1)
loss = (-torch.log(1-probas) * torch.abs(labels - targets.reshape(probas.shape[0], 1))**power).sum(dim=1)
if reduction == "none":
return loss
elif reduction == "sum":
return loss.sum()
elif reduction == "mean":
return loss.mean()
else:
raise ValueError("reduction must be 'none', 'sum', or 'mean'")
cdw_ce_loss(logits, targets)
tensor(8.9538)
class MLP(torch.nn.Module):
def __init__(self, in_features, num_classes, num_hidden_1=300, num_hidden_2=300):
super().__init__()
self.num_classes = num_classes
self.my_network = torch.nn.Sequential(
# 1st hidden layer
torch.nn.Linear(in_features, num_hidden_1, bias=False),
torch.nn.LeakyReLU(),
torch.nn.Dropout(0.2),
torch.nn.BatchNorm1d(num_hidden_1),
# 2nd hidden layer
torch.nn.Linear(num_hidden_1, num_hidden_2, bias=False),
torch.nn.LeakyReLU(),
torch.nn.Dropout(0.2),
torch.nn.BatchNorm1d(num_hidden_2),
# Output layer
torch.nn.Linear(num_hidden_2, num_classes)
)
def forward(self, x):
logits = self.my_network(x)
return logits
torch.manual_seed(random_seed)
model = MLP(in_features=8, num_classes=NUM_CLASSES)
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
for epoch in range(num_epochs):
model = model.train()
for batch_idx, (features, class_labels) in enumerate(train_loader):
class_labels = class_labels.to(DEVICE)
features = features.to(DEVICE)
logits = model(features)
loss = cdw_ce_loss(logits, class_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
### LOGGING
if not batch_idx % 200:
print ('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
%(epoch+1, num_epochs, batch_idx,
len(train_loader), loss))
Epoch: 001/050 | Batch 000/007 | Loss: 134.0854 Epoch: 002/050 | Batch 000/007 | Loss: 16.1060 Epoch: 003/050 | Batch 000/007 | Loss: 16.2816 Epoch: 004/050 | Batch 000/007 | Loss: 12.1848 Epoch: 005/050 | Batch 000/007 | Loss: 9.2307 Epoch: 006/050 | Batch 000/007 | Loss: 5.7477 Epoch: 007/050 | Batch 000/007 | Loss: 4.4735 Epoch: 008/050 | Batch 000/007 | Loss: 4.3764 Epoch: 009/050 | Batch 000/007 | Loss: 4.5788 Epoch: 010/050 | Batch 000/007 | Loss: 3.4303 Epoch: 011/050 | Batch 000/007 | Loss: 4.2646 Epoch: 012/050 | Batch 000/007 | Loss: 2.7244 Epoch: 013/050 | Batch 000/007 | Loss: 3.5027 Epoch: 014/050 | Batch 000/007 | Loss: 3.1227 Epoch: 015/050 | Batch 000/007 | Loss: 1.9005 Epoch: 016/050 | Batch 000/007 | Loss: 4.4430 Epoch: 017/050 | Batch 000/007 | Loss: 2.2113 Epoch: 018/050 | Batch 000/007 | Loss: 2.9496 Epoch: 019/050 | Batch 000/007 | Loss: 2.4737 Epoch: 020/050 | Batch 000/007 | Loss: 2.2458 Epoch: 021/050 | Batch 000/007 | Loss: 2.2490 Epoch: 022/050 | Batch 000/007 | Loss: 2.5528 Epoch: 023/050 | Batch 000/007 | Loss: 3.3671 Epoch: 024/050 | Batch 000/007 | Loss: 1.6624 Epoch: 025/050 | Batch 000/007 | Loss: 1.5456 Epoch: 026/050 | Batch 000/007 | Loss: 1.8861 Epoch: 027/050 | Batch 000/007 | Loss: 1.5842 Epoch: 028/050 | Batch 000/007 | Loss: 2.4168 Epoch: 029/050 | Batch 000/007 | Loss: 1.6376 Epoch: 030/050 | Batch 000/007 | Loss: 1.8073 Epoch: 031/050 | Batch 000/007 | Loss: 2.5007 Epoch: 032/050 | Batch 000/007 | Loss: 1.4211 Epoch: 033/050 | Batch 000/007 | Loss: 1.9054 Epoch: 034/050 | Batch 000/007 | Loss: 1.3790 Epoch: 035/050 | Batch 000/007 | Loss: 2.0045 Epoch: 036/050 | Batch 000/007 | Loss: 2.9551 Epoch: 037/050 | Batch 000/007 | Loss: 1.3715 Epoch: 038/050 | Batch 000/007 | Loss: 1.7470 Epoch: 039/050 | Batch 000/007 | Loss: 1.7664 Epoch: 040/050 | Batch 000/007 | Loss: 1.3839 Epoch: 041/050 | Batch 000/007 | Loss: 1.1422 Epoch: 042/050 | Batch 000/007 | Loss: 1.1026 Epoch: 043/050 | Batch 000/007 | Loss: 1.4556 Epoch: 044/050 | Batch 000/007 | Loss: 1.0135 Epoch: 045/050 | Batch 000/007 | Loss: 1.7183 Epoch: 046/050 | Batch 000/007 | Loss: 1.2620 Epoch: 047/050 | Batch 000/007 | Loss: 1.5380 Epoch: 048/050 | Batch 000/007 | Loss: 1.0275 Epoch: 049/050 | Batch 000/007 | Loss: 1.3223 Epoch: 050/050 | Batch 000/007 | Loss: 1.1917
Finally, after model training, we can evaluate the performance of the model. For example, via the mean absolute error and mean squared error measures.
For this, we are going to use the beckham_logits_to_labels
to convert the logits into ordinal class labels as shown below:
def logits_to_labels(logits, model):
predictions = torch.argmax(logits, dim=1)
return predictions
def compute_mae_and_mse(model, data_loader, device):
with torch.inference_mode():
mae, mse, acc, num_examples = 0., 0., 0., 0
for i, (features, targets) in enumerate(data_loader):
features = features.to(device)
targets = targets.float().to(device)
logits = model(features)
predicted_labels = logits_to_labels(logits, model)
num_examples += targets.size(0)
mae += torch.sum(torch.abs(predicted_labels - targets))
mse += torch.sum((predicted_labels - targets)**2)
mae = mae / num_examples
mse = mse / num_examples
return mae, mse
train_mae, train_mse = compute_mae_and_mse(model, train_loader, DEVICE)
test_mae, test_mse = compute_mae_and_mse(model, test_loader, DEVICE)
print(f'Mean absolute error (train/test): {train_mae:.2f} | {test_mae:.2f}')
print(f'Mean squared error (train/test): {train_mse:.2f} | {test_mse:.2f}')
Mean absolute error (train/test): 0.36 | 0.41 Mean squared error (train/test): 0.37 | 0.46