#!/usr/bin/env python
# coding: utf-8

# # Fair classifiers with adversarial networks
# 
# Gilles Louppe, 2017.

# We illustrate how one can use adversarial networks for building a classifier whose output is forced to be independent of some chosen attribute. We follow the adversarial networks setup described in "Learning to Pivot with Adversarial Networks" (Louppe, Kagan and Cranmer, 2016, [arXiv:1611.01046](https://arxiv.org/abs/1611.01046)).
# 
# In this notebook, we will show more specifically how one can build a fair classifier whose decision is made independent of gender.

# ```
# @article{louppe2016pivot,
#            author = {{Louppe}, G. and {Kagan}, M. and {Cranmer}, K.},
#             title = "{Learning to Pivot with Adversarial Networks}",
#           journal = {ArXiv e-prints},
#     archivePrefix = "arXiv",
#            eprint = {1611.01046},
#      primaryClass = "stat.ML",
#              year = 2016,
#             month = nov,
# }
# ```

# In[64]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display
get_ipython().run_line_magic('matplotlib', 'inline')


# # Prepare data

# We are using the [adult](https://archive.ics.uci.edu/ml/datasets/Adult) UCI dataset, where the prediction task is to predict whether someone makes over 50,000$ a year.

# In[65]:


# Load the UCI Adult data. Column names are supplied through `names`, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): "Martial Status" looks like a typo for "Marital Status"; it is
# a runtime column name, so it is left unchanged here.
original_data = pd.read_csv(
    "adult.data.txt", 
    names=["Age", "Workclass", "fnlwgt", "Education", "Education-Num", 
           "Martial Status", "Occupation", "Relationship", "Race", "Sex", 
           "Capital Gain", "Capital Loss", "Hours per week", "Country", "Target"],
    # The regex separator also consumes the whitespace around each comma
    # (regex separators require the python engine); "?" cells are read as NaN.
    sep=r'\s*,\s*', engine='python', na_values="?")
# Last expression of the cell: shows the first rows as the cell output.
original_data.head()


# In[66]:


# One-hot encode the categorical columns into indicator columns.
data = pd.get_dummies(original_data)
# Label to predict: the indicator column for an income of ">50K".
target = data["Target_>50K"].values
# Attribute the classifier output should become independent of: the indicator
# column for "Male".
gender = data["Sex_Male"].values
# Remove both label indicator columns so the label is not part of the features.
# The "Sex_*" indicator columns are not removed and stay in the feature matrix.
del data["Target_<=50K"]
del data["Target_>50K"]


# In[67]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split features, labels and the gender attribute with one shared 50/50
# partition. No random_state is given, so the partition changes between runs.
X_train, X_test, y_train, y_test, gender_train, gender_test = train_test_split(data, target, gender, train_size=0.5)
# Fit the standardisation on the training half only, then apply the same
# transform to the test half.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# # Standard classifier

# We first train a standard neural network on the training data.

# In[38]:


import keras.backend as K
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import SGD

# Baseline classifier: three hidden ReLU layers of 32 units followed by a
# single sigmoid output unit.
inputs = Input(shape=(X_train.shape[1],))  # one input per feature column
Dx = Dense(32, activation="relu")(inputs)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(1, activation="sigmoid")(Dx)
# NOTE(review): `input=` / `output=` are the Keras 1 keyword spellings
# (later renamed `inputs=` / `outputs=`) - confirm the targeted Keras version.
D = Model(input=[inputs], output=[Dx])
D.compile(loss="binary_crossentropy", optimizer="adam")


# In[39]:


# Train the baseline classifier on the training half for 10 epochs.
# NOTE(review): `nb_epoch` is the Keras 1 spelling (later renamed `epochs`) -
# confirm the targeted Keras version.
D.fit(X_train, y_train, nb_epoch=10)


# In[40]:


from sklearn.metrics import roc_auc_score
# Score the held-out half with the baseline classifier and report ROC AUC
# (last expression of the cell, so its value is the cell output).
y_pred = D.predict(X_test)
roc_auc_score(y_test, y_pred)


# Performance is good, but as the plot below illustrates, the distribution of the classifier output is different depending on gender. In particular, the classifier models that women are less likely to make more than 50,000$ a year than men.

# In[43]:


# Compare the distribution of classifier outputs on the test set for the two
# gender groups. `density=True` normalises each histogram to unit area so the
# groups are comparable despite their different sizes; it replaces the legacy
# `normed` keyword, which was deprecated and later removed from matplotlib.
plt.hist(y_pred[gender_test == 1], bins=50, histtype="step", density=True, label="M")
plt.hist(y_pred[gender_test == 0], bins=50, histtype="step", density=True, label="F")
plt.ylim(0, 5)  # clip the y axis so tall bins do not flatten the rest
plt.legend()
plt.grid()
plt.show()


# The Pearson correlation coefficient between gender and the classifier output also clearly highlights this dependency.

# In[47]:


from scipy.stats import pearsonr
# Correlation between gender and the baseline classifier output on the test
# set; the predictions are flattened with ravel() before being passed in.
pearsonr(gender_test, D.predict(X_test).ravel())


# # Training with adversarial networks

# Let us now jointly train our classifier with an adversarial network. The goal of this second network is to predict gender from the classifier output. If this network does well, it clearly indicates that the classifier output is correlated with the attribute. Accordingly, one can force the classifier to distort its decision so as to make the adversarial network perform worse. This is the strategy we will use.

# In[48]:


def make_trainable(network, flag):
    """Set the `trainable` attribute of a network and of each of its layers.

    Parameters
    ----------
    network : object
        Object exposing a `trainable` attribute and an iterable `layers`.
    flag : bool
        Value assigned to `trainable` on the network and on every layer.
    """
    # The network itself comes first, followed by its layers in order.
    for component in (network,) + tuple(network.layers):
        component.trainable = flag

# Fresh input tensor and classifier; `inputs`, `Dx` and `D` rebind the names
# previously used for the baseline model.
inputs = Input(shape=(X_train.shape[1],))

# Classifier D: same architecture as the baseline (3 x 32 ReLU, sigmoid output).
Dx = Dense(32, activation="relu")(inputs)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(32, activation="relu")(Dx)
Dx = Dense(1, activation="sigmoid")(Dx)
D = Model(input=[inputs], output=[Dx])

# Adversary R: stacked on the classifier's output tensor, so its only input
# signal is the classifier's prediction. It outputs a sigmoid score that is
# later trained against the gender attribute.
Rx = Dx
Rx = Dense(32, activation="relu")(Rx)
Rx = Dense(32, activation="relu")(Rx)
Rx = Dense(32, activation="relu")(Rx)
Rx = Dense(1, activation="sigmoid")(Rx)
# R is defined from the raw inputs, so its graph runs through D's layers.
R = Model(input=[inputs], output=[Rx])


# In[49]:


# Weight of the adversary's loss in the classifier update (it enters the
# combined objective with a negative sign) and rescaling factor used when
# reporting the adversary loss.
lam = 10.0  # control the trade-off between classification performance and independence

def make_loss_D(c):
    """Return a loss function for the classifier output, scaled by `c`.

    The returned function takes `(y_true, y_pred)` and evaluates
    `c * K.binary_crossentropy(y_pred, y_true)`.

    NOTE(review): the backend function is called as (prediction, target); the
    expected positional order differs between Keras releases - confirm it
    matches the installed version.
    """
    def loss_D(y_true, y_pred):
        crossentropy = K.binary_crossentropy(y_pred, y_true)
        return c * crossentropy

    return loss_D

def make_loss_R(c):
    """Return a loss function for the adversary output, scaled by `c`.

    The returned function takes `(z_true, z_pred)` and evaluates
    `c * K.binary_crossentropy(z_pred, z_true)`. A negative `c` turns the
    adversary's loss into a quantity to be maximised.

    NOTE(review): the backend function is called as (prediction, target); the
    expected positional order differs between Keras releases - confirm it
    matches the installed version.
    """
    def loss_R(z_true, z_pred):
        crossentropy = K.binary_crossentropy(z_pred, z_true)
        return c * crossentropy

    return loss_R

# Stand-alone classifier, compiled with the unweighted classification loss.
# It is used for pretraining D and for prediction.
opt_D = SGD()
D.compile(loss=[make_loss_D(c=1.0)], optimizer=opt_D)

# Classifier-update model: outputs both the classifier prediction and the
# adversary prediction. The adversary loss enters with weight -lam, so
# minimising the combined loss lowers the classification loss while raising
# the adversary's loss.
# The flags are set so that D is marked trainable and R frozen at compile time.
# R is frozen first and D re-enabled second; presumably this order matters
# because R's layer list also contains D's layers (R is built on top of Dx) -
# TODO confirm.
opt_DRf = SGD(momentum=0.0)
DRf = Model(input=[inputs], output=[D(inputs), R(inputs)])
make_trainable(R, False)
make_trainable(D, True)
DRf.compile(loss=[make_loss_D(c=1.0), make_loss_R(c=-lam)], optimizer=opt_DRf)

# Adversary-update model: outputs only the adversary prediction, compiled with
# the flags reversed (R enabled first, then D frozen) and the unweighted
# adversary loss.
opt_DfR = SGD(momentum=0.0)
DfR = Model(input=[inputs], output=[R(inputs)])
make_trainable(R, True)
make_trainable(D, False)
DfR.compile(loss=[make_loss_R(c=1.0)], optimizer=opt_DfR)


# In[52]:


# Pretraining of D: fit the classifier alone on the income labels for 10
# epochs before any adversarial updates. The flags mark D trainable and R
# frozen, in the same order as used when compiling DRf.
make_trainable(R, False)
make_trainable(D, True)
D.fit(X_train, y_train, nb_epoch=10)


# In[53]:


# Pretraining of R: fit the adversary through DfR to predict gender for 10
# epochs. The flags mark R trainable and then D frozen, in the same order as
# used when compiling DfR.
make_trainable(R, True)
make_trainable(D, False)
DfR.fit(X_train, gender_train, nb_epoch=10)


# In[55]:


def plot_losses(i, losses):
    """Draw the recorded loss histories as three vertically stacked panels.

    Parameters
    ----------
    i : int
        Iteration index supplied by the caller; it is not used when drawing.
    losses : dict
        Maps "L_f", "L_r" and "L_f - L_r" to sequences of recorded values.
        The "L_r" series is divided by the module-level `lam` before plotting.
    """
    # Clear the cell output (deferred until new output arrives) and display
    # the current figure.
    display.clear_output(wait=True)
    display.display(plt.gcf())

    # (subplot position, key in `losses`, divide by lam?, legend label, colour)
    panels = (
        (311, "L_f", False, r"$L_f$", "blue"),
        (312, "L_r", True, r"$L_r$", "green"),
        (313, "L_f - L_r", False, r"$L_f - \lambda L_r$", "red"),
    )

    top_axis = None
    for position, key, rescale, label, colour in panels:
        if top_axis is None:
            top_axis = plt.subplot(position)
        else:
            # The lower panels share the x axis of the first one.
            plt.subplot(position, sharex=top_axis)

        series = np.array(losses[key])
        if rescale:
            series = series / lam
        plt.plot(range(len(series)), series, label=label, color=colour)
        plt.legend(loc="upper right")
        plt.grid()

    plt.show()


# In[56]:


# Loss histories appended to by the adversarial training loop and read by
# plot_losses: classifier loss, adversary loss, and the combined objective.
losses = {"L_f": [], "L_r": [], "L_f - L_r": []}


# In[57]:


# Alternating adversarial training: each iteration records the current losses
# on the test split, updates the classifier on one mini-batch, then updates
# the adversary on the training set.
batch_size = 128

for i in range(201):
    # Evaluate the combined model on the test split before this iteration's
    # updates. Presumably `l` is [combined loss, classifier loss, weighted
    # adversary loss] - verify against Keras `evaluate` for multi-output models.
    l = DRf.evaluate(X_test, [y_test, gender_test], verbose=0)    
    # The `[None][0]` indexing appears only to unwrap a NumPy scalar - TODO confirm.
    losses["L_f - L_r"].append(l[0][None][0])
    losses["L_f"].append(l[1][None][0])
    # DRf weights the adversary loss by -lam, so the sign is flipped back here.
    losses["L_r"].append(-l[2][None][0])
    # Print the adversary loss with the lam factor divided out.
    print(losses["L_r"][-1] / lam)
    
    # Refresh the plot every fifth iteration.
    if i % 5 == 0:
        plot_losses(i, losses)

    # Fit D: a single train_on_batch call through DRf on `batch_size` training
    # rows drawn at random without repetition.
    # NOTE(review): whether changing `trainable` after compile affects the
    # already-compiled models depends on the Keras version - confirm.
    make_trainable(R, False)
    make_trainable(D, True)
    indices = np.random.permutation(len(X_train))[:batch_size]
    DRf.train_on_batch(X_train[indices], [y_train[indices], gender_train[indices]])
        
    # Fit R: one epoch over the whole training set through DfR.
    make_trainable(R, True)
    make_trainable(D, False)
    DfR.fit(X_train, gender_train, batch_size=batch_size, nb_epoch=1, verbose=1)


# In[58]:


# Score the held-out half with the adversarially trained classifier and report
# ROC AUC (last expression of the cell, so its value is the cell output).
y_pred = D.predict(X_test)
roc_auc_score(y_test, y_pred)


# Performance is slightly worse, but as the plot and the Pearson correlation coefficient below show, the distribution of the classifier output is now almost independent of gender. The classifier is now fair.

# In[62]:


# Compare the distribution of classifier outputs on the test set for the two
# gender groups after adversarial training. `density=True` normalises each
# histogram to unit area so the groups are comparable despite their different
# sizes; it replaces the legacy `normed` keyword, which was deprecated and
# later removed from matplotlib.
plt.hist(y_pred[gender_test == 1], bins=50, histtype="step", density=True, label="M")
plt.hist(y_pred[gender_test == 0], bins=50, histtype="step", density=True, label="F")
plt.ylim(0, 5)  # clip the y axis so tall bins do not flatten the rest
plt.legend()
plt.grid()
plt.show()


# In[63]:


from scipy.stats import pearsonr
# Correlation between gender and the adversarially trained classifier's output
# on the test set; the predictions are flattened with ravel() first.
pearsonr(gender_test, D.predict(X_test).ravel())