# uncomment and install dependencies before continuing
# !pip install --upgrade inFairness requests tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from inFairness.fairalgo import SenSeI
from inFairness import distances
from inFairness.auditor import SenSRAuditor, SenSeIAuditor
%load_ext autoreload
%autoreload 2
import data
import metrics
/Users/mayank/opt/anaconda3/envs/infairness/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
class AdultDataset(Dataset):
    """Minimal map-style dataset pairing feature rows with labels."""

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Return the (features, label) pair at position `idx`.
        return self.data[idx], self.labels[idx]
# Load the Adult Income dataset via the project-local `data` helper.
# `load_data` returns (features, labels) dataframe pairs for train and test.
train_df, test_df = data.load_data()
X_train_df, Y_train_df = train_df
X_test_df, Y_test_df = test_df
# Let's drop the protected attributes from the training and test data and store them in a
# separate dataframe that we'll use later to train the individually fair metric.
protected_vars = ['race_White', 'sex_Male']
X_protected_df = X_train_df[protected_vars]
X_train_df = X_train_df.drop(columns=protected_vars)
X_test_df = X_test_df.drop(columns=protected_vars)
# Create test data with spouse variable flipped
# (toggles the one-hot `relationship_Wife` column: 1 -> 0, 0 -> 1).
# This counterfactual copy of the test set is used below to measure
# how consistent a model's predictions are under a spouse-role swap.
X_test_df_spouse_flipped = X_test_df.copy()
X_test_df_spouse_flipped.relationship_Wife = 1 - X_test_df_spouse_flipped.relationship_Wife
X_train_df.head()
age | capital-gain | capital-loss | education-num | hours-per-week | marital-status_Divorced | marital-status_Married-AF-spouse | marital-status_Married-civ-spouse | marital-status_Married-spouse-absent | marital-status_Never-married | ... | relationship_Own-child | relationship_Unmarried | relationship_Wife | workclass_Federal-gov | workclass_Local-gov | workclass_Private | workclass_Self-emp-inc | workclass_Self-emp-not-inc | workclass_State-gov | workclass_Without-pay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.409331 | -0.14652 | -0.218253 | -1.613806 | -0.496770 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | -1.104187 | -0.14652 | -0.218253 | -0.050064 | -1.741764 | 0 | 0 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 1.393118 | -0.14652 | -0.218253 | -0.440999 | 2.574214 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | -0.423104 | -0.14652 | -0.218253 | -0.440999 | 1.163221 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4 | -0.877159 | -0.14652 | -0.218253 | 1.122743 | 0.748224 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 39 columns
# Run everything on CPU; switch to 'cuda' here if a GPU is available.
device = torch.device('cpu')
# Convert all pandas dataframes to PyTorch tensors
X_train, y_train = data.convert_df_to_tensor(X_train_df, Y_train_df)
X_test, y_test = data.convert_df_to_tensor(X_test_df, Y_test_df)
X_test_flip, y_test_flip = data.convert_df_to_tensor(X_test_df_spouse_flipped, Y_test_df)
# Protected attributes (race/sex one-hot columns) as a float tensor;
# used later to fit the logistic-regression sensitive-subspace metric.
X_protected = torch.tensor(X_protected_df.values).float()
# Create the training and testing dataset
train_ds = AdultDataset(X_train, y_train)
test_ds = AdultDataset(X_test, y_test)
test_ds_flip = AdultDataset(X_test_flip, y_test_flip)
# Create train and test dataloaders
# Shuffle only the training loader; the two test loaders keep a fixed order
# so the flipped and unflipped test sets stay row-aligned for the
# consistency comparison.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=1000, shuffle=False)
test_dl_flip = DataLoader(test_ds_flip, batch_size=1000, shuffle=False)
# Create a fully connected neural network
class Model(nn.Module):
    """Two-hidden-layer MLP classifier returning raw logits.

    Args:
        input_size: number of input features.
        output_size: number of output logits (classes).
        hidden_size: width of both hidden layers. Defaults to 100,
            matching the previously hard-coded value, so existing
            callers are unaffected.
    """

    def __init__(self, input_size, output_size, hidden_size=100):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fcout = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # ReLU on both hidden layers; no activation on the output layer
        # (F.cross_entropy expects unnormalized logits).
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fcout(x)
input_size = X_train.shape[1]
output_size = 2

# Train a standard (fairness-unaware) baseline classifier.
network_standard = Model(input_size, output_size).to(device)
optimizer = torch.optim.Adam(network_standard.parameters(), lr=1e-3)
loss_fn = F.cross_entropy
EPOCHS = 10

network_standard.train()
for epoch in tqdm(range(EPOCHS)):
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        # BUGFIX: the original applied `.squeeze()` to the logits. That is a
        # no-op for batches larger than 1, but a final batch of size 1 would
        # be collapsed from shape (1, 2) to (2,), making F.cross_entropy
        # fail. The model already outputs (batch, num_classes), so the
        # squeeze is removed.
        y_pred = network_standard(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()
100%|██████████| 10/10 [00:03<00:00, 2.57it/s]
# Evaluate the baseline model with the project-local `metrics` helpers:
# overall accuracy, class-balanced accuracy, and "spouse consistency" —
# presumably the fraction of test rows whose prediction is unchanged when
# relationship_Wife is flipped (verify against the metrics module).
accuracy = metrics.accuracy(network_standard, test_dl, device)
balanced_acc = metrics.balanced_accuracy(network_standard, test_dl, device)
spouse_consistency = metrics.spouse_consistency(network_standard, test_dl, test_dl_flip, device)
print(f'Accuracy: {accuracy}')
print(f'Balanced accuracy: {balanced_acc}')
print(f'Spouse consistency: {spouse_consistency}')
Accuracy: 0.855042040348053 Balanced accuracy: 0.7806884556970295 Spouse consistency: 0.9593100398053959
# Train an individually fair model with SenSeI, using a logistic-regression
# sensitive-subspace metric over the inputs.
network_fair_LR = Model(input_size, output_size).to(device)
optimizer = torch.optim.Adam(network_fair_LR.parameters(), lr=1e-3)
lossfn = F.cross_entropy
# Input metric: sensitive subspace learned by regressing the protected
# attributes (X_protected) on the features. Output metric: squared
# Euclidean distance between logit vectors.
distance_x_LR = distances.LogisticRegSensitiveSubspace()
distance_y = distances.SquaredEuclideanDistance()
distance_x_LR.fit(X_train, data_SensitiveAttrs=X_protected)
distance_y.fit(num_dims=output_size)
distance_x_LR.to(device)
distance_y.to(device)
# SenSeI hyperparameters — NOTE(review): per the inFairness docs, `rho`
# weights the fairness regularizer, `eps` bounds the auditor's input
# perturbation, and the inner auditor runs `auditor_nsteps` gradient steps
# at `auditor_lr`; confirm against the installed library version.
rho = 5.0
eps = 0.1
auditor_nsteps = 100
auditor_lr = 1e-3
fairalgo_LR = SenSeI(network_fair_LR, distance_x_LR, distance_y, lossfn, rho, eps, auditor_nsteps, auditor_lr)
fairalgo_LR.train()
for epoch in tqdm(range(EPOCHS)):
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        # The fair algorithm returns a result object; `.loss` combines the
        # task loss with the fairness penalty.
        result = fairalgo_LR(x, y)
        result.loss.backward()
        optimizer.step()
100%|██████████| 10/10 [03:02<00:00, 18.29s/it]
# Evaluate the SenSeI (LR metric) model with the same three metrics as the
# baseline, so the numbers are directly comparable.
accuracy = metrics.accuracy(network_fair_LR, test_dl, device)
balanced_acc = metrics.balanced_accuracy(network_fair_LR, test_dl, device)
spouse_consistency = metrics.spouse_consistency(network_fair_LR, test_dl, test_dl_flip, device)
print(f'Accuracy: {accuracy}')
print(f'Balanced accuracy: {balanced_acc}')
print(f'Spouse consistency: {spouse_consistency}')
Accuracy: 0.8369084596633911 Balanced accuracy: 0.7357549314737899 Spouse consistency: 0.9998894294559929
# Fit the EXPLORE input metric. The gender label is the last protected
# column (sex_Male, given the order of `protected_vars` above), and
# `data.create_data_pairs` builds the example pairs (X1, X2) with pair
# labels Y_pairs that EXPLORE learns from.
Y_gender = X_protected[:, -1]
X1, X2, Y_pairs = data.create_data_pairs(X_train, y_train, Y_gender)
distance_x_explore = distances.EXPLOREDistance()
distance_x_explore.fit(X1, X2, Y_pairs, iters=1000, batchsize=10000)
distance_x_explore.to(device)
/Users/mayank/Documents/[Projects]/open-source/inFairness/examples/adult-income-prediction/../../inFairness/distances/explore_distance.py:76: RuntimeWarning: overflow encountered in exp sclVec = 2.0 / (np.exp(diag) - 1)
# Train a second individually fair model with SenSeI, this time using the
# EXPLORE metric fitted above (distance_y is reused from the LR setup).
network_fair_explore = Model(input_size, output_size).to(device)
optimizer = torch.optim.Adam(network_fair_explore.parameters(), lr=1e-3)
lossfn = F.cross_entropy
# Different hyperparameters than the LR run: a larger fairness weight but
# far fewer, larger auditor steps.
rho = 25.0
eps = 0.1
auditor_nsteps = 10
auditor_lr = 1e-2
fairalgo_explore = SenSeI(network_fair_explore, distance_x_explore, distance_y, lossfn, rho, eps, auditor_nsteps, auditor_lr)
fairalgo_explore.train()
for epoch in tqdm(range(EPOCHS)):
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        # `.loss` combines the task loss with the fairness penalty.
        result = fairalgo_explore(x, y)
        result.loss.backward()
        optimizer.step()
100%|██████████| 10/10 [00:24<00:00, 2.42s/it]
# Evaluate the SenSeI (EXPLORE metric) model with the same three metrics as
# the other two models.
accuracy = metrics.accuracy(network_fair_explore, test_dl, device)
balanced_acc = metrics.balanced_accuracy(network_fair_explore, test_dl, device)
spouse_consistency = metrics.spouse_consistency(network_fair_explore, test_dl, test_dl_flip, device)
print(f'Accuracy: {accuracy}')
print(f'Balanced accuracy: {balanced_acc}')
print(f'Spouse consistency: {spouse_consistency}')
Accuracy: 0.8224236965179443 Balanced accuracy: 0.6999390313607438 Spouse consistency: 0.9997788589119858
# Auditing using the SenSR Auditor + LR metric
# The auditor searches (for `audit_nsteps` gradient steps at `audit_lr`,
# starting from uniform noise in [min_noise, max_noise]) for input
# perturbations that inflate the loss, and reports a lower bound on the
# adversarial-to-original loss ratio; a model is declared fair when that
# bound stays below `audit_threshold`.
audit_nsteps = 1000
audit_lr = 0.1
auditor_LR = SenSRAuditor(loss_fn=loss_fn, distance_x=distance_x_LR, num_steps=audit_nsteps, lr=audit_lr, max_noise=0.5, min_noise=-0.5)
# Audit all three trained models against the same (LR) fair metric.
audit_result_stdmodel = auditor_LR.audit(network_standard, X_test, y_test, lambda_param=10.0, audit_threshold=1.15)
audit_result_fairmodel_LR = auditor_LR.audit(network_fair_LR, X_test, y_test, lambda_param=10.0, audit_threshold=1.15)
audit_result_fairmodel_explore = auditor_LR.audit(network_fair_explore, X_test, y_test, lambda_param=10.0, audit_threshold=1.15)
print("="*100)
print("LR metric")
print(f"Loss ratio (Standard model) : {audit_result_stdmodel.lower_bound}. Is model fair: {audit_result_stdmodel.is_model_fair}")
print(f"Loss ratio (fair model - LogReg metric) : {audit_result_fairmodel_LR.lower_bound}. Is model fair: {audit_result_fairmodel_LR.is_model_fair}")
print(f"Loss ratio (fair model - EXPLORE metric) : {audit_result_fairmodel_explore.lower_bound}. Is model fair: {audit_result_fairmodel_explore.is_model_fair}")
print("-"*100)
print("\t As signified by these numbers, the fair models are fairer than the standard model")
print("="*100)
/Users/mayank/Documents/[Projects]/open-source/inFairness/examples/adult-income-prediction/../../inFairness/auditor/auditor.py:54: RuntimeWarning: invalid value encountered in divide loss_ratio = np.divide(loss_vals_adversarial, loss_vals_original)
==================================================================================================== LR metric Loss ratio (Standard model) : 2.4822924476956905. Is model fair: False Loss ratio (fair model - LogReg metric) : 1.0421434064879227. Is model fair: True Loss ratio (fair model - EXPLORE metric) : 1.026998276114377. Is model fair: True ---------------------------------------------------------------------------------------------------- As signified by these numbers, the fair models are fairer than the standard model ====================================================================================================
# Auditing using the SenSR Auditor + EXPLORE metric
# Same audit procedure as above, but measuring input perturbations with the
# EXPLORE distance instead of the LR sensitive-subspace distance.
audit_nsteps = 1000
audit_lr = 0.1
auditor_explore = SenSRAuditor(loss_fn=loss_fn, distance_x=distance_x_explore, num_steps=audit_nsteps, lr=audit_lr, max_noise=0.5, min_noise=-0.5)
# Audit all three trained models against the EXPLORE fair metric.
audit_result_stdmodel = auditor_explore.audit(network_standard, X_test, y_test, lambda_param=10.0, audit_threshold=1.15)
audit_result_fairmodel_LR = auditor_explore.audit(network_fair_LR, X_test, y_test, lambda_param=10.0, audit_threshold=1.15)
audit_result_fairmodel_explore = auditor_explore.audit(network_fair_explore, X_test, y_test, lambda_param=10.0, audit_threshold=1.15)
print("="*100)
print("EXPLORE metric")
print(f"Loss ratio (Standard model) : {audit_result_stdmodel.lower_bound}. Is model fair: {audit_result_stdmodel.is_model_fair}")
print(f"Loss ratio (fair model - LogReg metric) : {audit_result_fairmodel_LR.lower_bound}. Is model fair: {audit_result_fairmodel_LR.is_model_fair}")
print(f"Loss ratio (fair model - EXPLORE metric) : {audit_result_fairmodel_explore.lower_bound}. Is model fair: {audit_result_fairmodel_explore.is_model_fair}")
print("-"*100)
print("\t As signified by these numbers, the fair models are fairer than the standard model")
print("="*100)
==================================================================================================== EXPLORE metric Loss ratio (Standard model) : 3.2874276326186633. Is model fair: False Loss ratio (fair model - LogReg metric) : 1.0897408117340435. Is model fair: True Loss ratio (fair model - EXPLORE metric) : 1.063488311922447. Is model fair: True ---------------------------------------------------------------------------------------------------- As signified by these numbers, the fair models are fairer than the standard model ====================================================================================================