#!/usr/bin/env python
# coding: utf-8

# # Fair classifiers with adversarial networks
#
# Gilles Louppe, 2017.

# We illustrate how one can use adversarial networks for building a
# classifier whose output is forced to be independent of some chosen
# attribute. We follow the adversarial networks setup described in
# "Learning to Pivot with Adversarial Networks" (Louppe, Kagan and Cranmer,
# 2016, [arXiv:1611.01046](https://arxiv.org/abs/1611.01046)).
#
# In this notebook, we will show more specifically how one can build a fair
# classifier whose decision is made independent of gender.

# ```
# @article{louppe2016pivot,
#     author = {{Louppe}, G. and {Kagan}, M. and {Cranmer}, K.},
#     title = "{Learning to Pivot with Adversarial Networks}",
#     journal = {ArXiv e-prints},
#     archivePrefix = "arXiv",
#     eprint = {1611.01046},
#     primaryClass = "stat.ML",
#     year = 2016,
#     month = nov,
# }
# ```

# In[64]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display

# `get_ipython` only exists when the code runs inside IPython/Jupyter.
# Guard the magic so the file can also be executed as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# # Prepare data

# We are using the [adult](https://archive.ics.uci.edu/ml/datasets/Adult) UCI
# dataset, where the prediction task is to predict whether someone makes over
# 50,000$ a year.
# In[65]:

# The file is read without a header row, so column names are supplied here in
# file order. The separator regex swallows whitespace around each comma, and
# "?" entries are read as missing values.
# NOTE: "Martial Status" is a runtime column name and is kept verbatim.
original_data = pd.read_csv(
    "adult.data.txt",
    names=[
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
        "Martial Status", "Occupation", "Relationship", "Race", "Sex",
        "Capital Gain", "Capital Loss", "Hours per week", "Country",
        "Target",
    ],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")
original_data.head()

# In[66]:

# One-hot encode the frame, then pull out the label and the protected
# attribute as plain arrays.
data = pd.get_dummies(original_data)
target = data["Target_>50K"].values
gender = data["Sex_Male"].values

# Remove both label indicator columns from the feature matrix. The "Sex_*"
# indicator columns are not removed and therefore remain among the features.
del data["Target_<=50K"]
del data["Target_>50K"]

# In[67]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split features, labels and the gender attribute with the same row shuffle,
# half for training and half for testing.
X_train, X_test, y_train, y_test, gender_train, gender_test = train_test_split(
    data, target, gender, train_size=0.5)

# The scaler is fitted on the training half only and then re-used, unchanged,
# on the test half.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# # Standard classifier

# We first train a standard neural network on the training data.

# In[38]:

import keras.backend as K
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import SGD

# Three hidden layers of 32 ReLU units followed by a single sigmoid output.
# NOTE(review): `input=`/`output=` here and `nb_epoch=` below look like the
# Keras 1 keyword spellings; later Keras releases renamed them -- confirm
# against the installed Keras version before upgrading.
inputs = Input(shape=(X_train.shape[1],))
Dx = Dense(32, activation="relu")(inputs)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(1, activation="sigmoid")(Dx)
D = Model(input=[inputs], output=[Dx])
D.compile(loss="binary_crossentropy", optimizer="adam")

# In[39]:

D.fit(X_train, y_train, nb_epoch=10)

# In[40]:

from sklearn.metrics import roc_auc_score

y_pred = D.predict(X_test)
roc_auc_score(y_test, y_pred)

# Performance is good, but as the plot below illustrates, the distribution of
# the classifier output is different depending on gender. In particular, the
# classifier models that women are less likely to make more than 50,000$ a
# year than men.
# In[43]:

# Overlay the classifier-output histograms of the two gender groups
# (gender_test == 1 is labelled "M", gender_test == 0 is labelled "F").
# NOTE(review): `normed` is the older Matplotlib spelling of what newer
# releases call `density` -- confirm against the installed version.
for gender_value, group_label in ((1, "M"), (0, "F")):
    plt.hist(y_pred[gender_test == gender_value], bins=50, histtype="step",
             normed=1, label=group_label)
plt.ylim(0, 5)
plt.legend()
plt.grid()
plt.show()

# The Pearson correlation coefficient between gender and the classifier
# output also clearly highlights this dependency.

# In[47]:

from scipy.stats import pearsonr

pearsonr(gender_test, D.predict(X_test).ravel())

# # Training with adversarial networks

# Let us now jointly train our classifier with an adversarial network. The
# goal of this second network is to predict gender from the classifier
# output. If this network is doing well, then it clearly indicates that the
# classifier output is correlated with the attribute. Accordingly, one can
# force the classifier to distort its decision to make the adversarial
# network perform worse. This is the strategy we will use.

# In[48]:


def make_trainable(network, flag):
    """Set ``trainable`` to ``flag`` on ``network`` and on each of its layers.

    Parameters
    ----------
    network : object exposing ``trainable`` and an iterable ``layers``
        The model whose layers are switched on or off for training.
    flag : bool
        Value assigned to every ``trainable`` attribute.
    """
    network.trainable = flag
    for layer in network.layers:
        layer.trainable = flag


# Classifier D: same architecture as the standard classifier.
inputs = Input(shape=(X_train.shape[1],))
Dx = Dense(32, activation="relu")(inputs)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(1, activation="sigmoid")(Dx)
D = Model(input=[inputs], output=[Dx])

# Adversary R: stacked on top of the classifier output Dx, so the graph of R
# runs from `inputs` through D's layers. Because of that, whenever both
# models are toggled with make_trainable, the call made last decides the
# state of the layers they have in common.
Rx = Dx
Rx = Dense(32, activation="relu")(Rx)
Rx = Dense(32, activation="relu")(Rx)
Rx = Dense(32, activation="relu")(Rx)
Rx = Dense(1, activation="sigmoid")(Rx)
R = Model(input=[inputs], output=[Rx])

# In[49]:

lam = 10.0  # control the trade-off between classification performance and independence


def make_loss_D(c):
    """Return a loss function computing ``c`` times the binary cross-entropy.

    NOTE(review): ``K.binary_crossentropy`` is called as (prediction, target);
    the expected argument order differs between Keras releases -- confirm
    against the installed version.
    """
    def loss_D(y_true, y_pred):
        return c * K.binary_crossentropy(y_pred, y_true)

    return loss_D


def make_loss_R(c):
    """Return a loss function computing ``c`` times the binary cross-entropy.

    Same form as ``make_loss_D``; used for the adversary output, where ``c``
    may be negative.
    """
    def loss_R(z_true, z_pred):
        return c * K.binary_crossentropy(z_pred, z_true)

    return loss_R


opt_D = SGD()
D.compile(loss=[make_loss_D(c=1.0)], optimizer=opt_D)

# DRf: a two-output model (classifier output, adversary output). The flags
# are set here, before the model is compiled, so that R's own layers are
# frozen and D's layers are trainable.
opt_DRf = SGD(momentum=0.0)
DRf = Model(input=[inputs], output=[D(inputs), R(inputs)])
make_trainable(R, False)
make_trainable(D, True)
# Combined objective for DRf: classifier loss plus the adversary loss
# weighted by -lam.
DRf.compile(loss=[make_loss_D(c=1.0), make_loss_R(c=-lam)], optimizer=opt_DRf)

# DfR: adversary-only model, compiled with R's layers trainable and D's
# layers frozen.
opt_DfR = SGD(momentum=0.0)
DfR = Model(input=[inputs], output=[R(inputs)])
make_trainable(R, True)
make_trainable(D, False)
DfR.compile(loss=[make_loss_R(c=1.0)], optimizer=opt_DfR)

# In[52]:

# Pretraining of D
make_trainable(R, False)
make_trainable(D, True)
D.fit(X_train, y_train, nb_epoch=10)

# In[53]:

# Pretraining of R
make_trainable(R, True)
make_trainable(D, False)
DfR.fit(X_train, gender_train, nb_epoch=10)

# In[55]:


def plot_losses(i, losses):
    """Redraw the three loss curves in the notebook output area.

    Parameters
    ----------
    i : int
        Iteration index. Not used by the function; kept so existing calls
        keep working.
    losses : dict
        Maps "L_f", "L_r" and "L_f - L_r" to sequences of recorded values.
        The "L_r" values are divided by the module-level ``lam`` before
        being plotted.
    """
    display.clear_output(wait=True)
    display.display(plt.gcf())

    # (subplot position, key in `losses`, divisor or None, legend label, color)
    panels = (
        (311, "L_f", None, r"$L_f$", "blue"),
        (312, "L_r", lam, r"$L_r$", "green"),
        (313, "L_f - L_r", None, r"$L_f - \lambda L_r$", "red"),
    )

    first_axis = None
    for position, key, divisor, label, color in panels:
        if first_axis is None:
            first_axis = plt.subplot(position)
        else:
            # The lower panels share the x axis of the top one.
            plt.subplot(position, sharex=first_axis)

        values = np.array(losses[key])
        if divisor is not None:
            values = values / divisor
        plt.plot(range(len(values)), values, label=label, color=color)
        plt.legend(loc="upper right")
        plt.grid()

    plt.show()


# In[56]:

losses = {"L_f": [], "L_r": [], "L_f - L_r": []}

# In[57]:

batch_size = 128


def _first_scalar(value):
    """Return the first element of ``value`` as a scalar.

    Accepts a plain Python number, a NumPy scalar or an array.
    """
    return np.ravel(value)[0]


for i in range(201):
    # DRf.evaluate yields one value per compiled loss, preceded by the total:
    # [total, classifier loss, adversary loss weighted by -lam].
    scores = DRf.evaluate(X_test, [y_test, gender_test], verbose=0)
    losses["L_f - L_r"].append(_first_scalar(scores[0]))
    losses["L_f"].append(_first_scalar(scores[1]))
    # Flip the sign so the stored value is lam * L_r (non-negative weight).
    losses["L_r"].append(-_first_scalar(scores[2]))
    print(losses["L_r"][-1] / lam)

    if i % 5 == 0:
        plot_losses(i, losses)

    # Fit D
    make_trainable(R, False)
    make_trainable(D, True)
    indices = np.random.permutation(len(X_train))[:batch_size]
    DRf.train_on_batch(
        X_train[indices],
        [y_train[indices], gender_train[indices]])

    # Fit R
    make_trainable(R, True)
    make_trainable(D, False)
    DfR.fit(X_train, gender_train, batch_size=batch_size, nb_epoch=1, verbose=1)

# In[58]:

y_pred = D.predict(X_test)
roc_auc_score(y_test, y_pred)

# Performance is slightly worse, but as the plot and the pearson correlation coefficient show below,
# the distribution of the classifier output is now almost independent of
# gender. The classifier is now fair.

# In[62]:

# Overlay the classifier-output histograms of the two gender groups
# (gender_test == 1 is labelled "M", gender_test == 0 is labelled "F").
for gender_value, group_label in ((1, "M"), (0, "F")):
    plt.hist(y_pred[gender_test == gender_value], bins=50, histtype="step",
             normed=1, label=group_label)
plt.ylim(0, 5)
plt.legend()
plt.grid()
plt.show()

# In[63]:

from scipy.stats import pearsonr

# Correlation between the gender attribute and the flattened classifier
# output on the test half.
pearsonr(gender_test, D.predict(X_test).ravel())