In this tutorial we will show how to run both black-box and white-box attribute inference attacks, i.e., attacks that infer the value of a sensitive feature from a trained model. This will be demonstrated on the Nursery dataset (the original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/nursery).
In the case of the Nursery dataset, the sensitive feature we want to infer is the 'social' feature. In the original dataset this is a categorical feature with three possible values. To make the attack more successful, we reduced this to a binary feature by mapping the original value 'problematic' to 1 and all other original values to 0.
We have also already preprocessed the dataset so that all categorical features are one-hot encoded and the data is scaled using sklearn's StandardScaler.
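For reference, here is a rough sketch of what this preprocessing looks like. This is illustrative only: the actual logic lives inside art.utils.load_nursery, and the column names used here (including 'label' for the target) are assumptions.
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_nursery(df: pd.DataFrame) -> pd.DataFrame:
    # illustrative sketch; the real preprocessing is performed by load_nursery
    df = df.copy()
    # binarize the sensitive 'social' feature: 'problematic' -> 1, everything else -> 0
    df['social'] = (df['social'] == 'problematic').astype(int)
    # one-hot encode the remaining categorical features ('label' is the assumed target column)
    categorical = [c for c in df.columns if c not in ('social', 'label')]
    df = pd.get_dummies(df, columns=categorical)
    # scale all feature columns to zero mean and unit variance
    feature_cols = [c for c in df.columns if c != 'label']
    df[feature_cols] = StandardScaler().fit_transform(df[feature_cols])
    return df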
import os
import sys
# allow importing a local copy of ART from the parent directory
sys.path.insert(0, os.path.abspath('..'))
from art.utils import load_nursery
# load the preprocessed Nursery data; transform_social=True applies the binary 'social' mapping described above
(x_train, y_train), (x_test, y_test), _, _ = load_nursery(test_set=0.5, transform_social=True)
from sklearn.tree import DecisionTreeClassifier
from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier
# train the target model and wrap it in ART's scikit-learn estimator
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
art_classifier = ScikitlearnDecisionTreeClassifier(model)
print('Base model accuracy: ', model.score(x_test, y_test))
Base model accuracy: 0.9705155912318617
import numpy as np
from art.attacks.inference.attribute_inference import AttributeInferenceBlackBox
attack_train_ratio = 0.5
# use half of the model's training data to train the attack and the other half to evaluate it
attack_train_size = int(len(x_train) * attack_train_ratio)
attack_test_size = int(len(x_train) * attack_train_ratio)
attack_x_train = x_train[:attack_train_size]
attack_y_train = y_train[:attack_train_size]
attack_x_test = x_train[attack_train_size:]
attack_y_test = y_train[attack_train_size:]
attack_feature = 1 # social
# get original model's predictions
attack_x_test_predictions = np.argmax(art_classifier.predict(attack_x_test), axis=1).reshape(-1, 1)
# only attacked feature
attack_x_test_feature = attack_x_test[:, attack_feature].copy().reshape(-1, 1)
# attack test data without the attacked feature
attack_x_test = np.delete(attack_x_test, attack_feature, 1)
bb_attack = AttributeInferenceBlackBox(art_classifier, attack_feature=attack_feature)
# train the attack model: it learns to predict the attacked feature from the remaining features and the target model's predictions
bb_attack.fit(attack_x_train)
# the two scaled values of the binary 'social' feature (0 and 1 after StandardScaler)
values = [-0.70718864, 1.41404987]
# get inferred values
inferred_train_bb = bb_attack.infer(attack_x_test, pred=attack_x_test_predictions, values=values)
# check accuracy
train_acc = np.sum(inferred_train_bb == np.around(attack_x_test_feature, decimals=8).reshape(1,-1)) / len(inferred_train_bb)
print(train_acc)
0.5998765050941649
This means that for roughly 60% of the training set, the attacked feature is inferred correctly using this attack.
from art.attacks.inference.attribute_inference import AttributeInferenceWhiteBoxLifestyleDecisionTree
wb_attack = AttributeInferenceWhiteBoxLifestyleDecisionTree(art_classifier, attack_feature=attack_feature)
# prior probabilities of the two feature values (value 0 and value 1)
priors = [3465 / 5183, 1718 / 5183]
# get inferred values
inferred_train_wb1 = wb_attack.infer(attack_x_test, attack_x_test_predictions, values=values, priors=priors)
# check accuracy
train_acc = np.sum(inferred_train_wb1 == np.around(attack_x_test_feature, decimals=8).reshape(1,-1)) / len(inferred_train_wb1)
print(train_acc)
0.6288978079654214
from art.attacks.inference.attribute_inference import AttributeInferenceWhiteBoxDecisionTree
wb2_attack = AttributeInferenceWhiteBoxDecisionTree(art_classifier, attack_feature=attack_feature)
# get inferred values
inferred_train_wb2 = wb2_attack.infer(attack_x_test, attack_x_test_predictions, values=values, priors=priors)
# check accuracy
train_acc = np.sum(inferred_train_wb2 == np.around(attack_x_test_feature, decimals=8).reshape(1,-1)) / len(inferred_train_wb2)
print(train_acc)
0.7005248533497993
The white-box attacks are able to correctly infer the attacked feature value for approximately 63% and 70% of the training set, respectively.
Now let's check the precision and recall:
def calc_precision_recall(predicted, actual, positive_value=1):
    score = 0  # both predicted and actual are positive
    num_positive_predicted = 0  # predicted positive
    num_positive_actual = 0  # actual positive
    for i in range(len(predicted)):
        if predicted[i] == positive_value:
            num_positive_predicted += 1
        if actual[i] == positive_value:
            num_positive_actual += 1
        if predicted[i] == actual[i]:
            if predicted[i] == positive_value:
                score += 1
    if num_positive_predicted == 0:
        precision = 1
    else:
        precision = score / num_positive_predicted  # the fraction of predicted "Yes" responses that are correct
    if num_positive_actual == 0:
        recall = 1
    else:
        recall = score / num_positive_actual  # the fraction of actual "Yes" responses that are predicted correctly
    return precision, recall
# black-box
print(calc_precision_recall(inferred_train_bb, np.around(attack_x_test_feature, decimals=8), positive_value=1.41404987))
# white-box 1
print(calc_precision_recall(inferred_train_wb1, np.around(attack_x_test_feature, decimals=8), positive_value=1.41404987))
# white-box 2
print(calc_precision_recall(inferred_train_wb2, np.around(attack_x_test_feature, decimals=8), positive_value=1.41404987))
(0.34232954545454547, 0.22439478584729983)
(0.32320441988950277, 0.10893854748603352)
(0.652046783625731, 0.20763500931098697)
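The same metrics can also be computed with scikit-learn's built-in scorers, shown here for the black-box attack (note that, unlike the helper above, sklearn returns 0 rather than 1 when there are no positive predictions):
from sklearn.metrics import precision_score, recall_score
# equivalent computation using sklearn; pos_label is the scaled value representing 'problematic'
y_true = np.around(attack_x_test_feature, decimals=8).ravel()
print(precision_score(y_true, inferred_train_bb.ravel(), pos_label=1.41404987),
      recall_score(y_true, inferred_train_bb.ravel(), pos_label=1.41404987))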
To verify the significance of these results, we now run a baseline attack that tries to predict the value of the attacked feature using only the remaining features, without any access to the model itself.
from art.attacks.inference.attribute_inference import AttributeInferenceBaseline
baseline_attack = AttributeInferenceBaseline(attack_feature=attack_feature)
# train attack model
baseline_attack.fit(attack_x_train)
# infer values
inferred_train_baseline = baseline_attack.infer(attack_x_test, values=values)
# check accuracy
baseline_train_acc = np.sum(inferred_train_baseline == np.around(attack_x_test_feature, decimals=8).reshape(1,-1)) / len(inferred_train_baseline)
print(baseline_train_acc)
0.5433775856745909
We can see that both the black-box and white-box attacks perform better than the baseline (roughly 54%), meaning they indeed extract additional information from the model itself.
Finally, we demonstrate an attribute inference attack based on membership inference. The idea is to find the target feature value that maximizes the membership attack's confidence, indicating that this is the most likely value for a sample that was a member of the training data. The attack can be built on top of any membership inference attack (black-box or white-box), as long as that attack supports the given model.
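Conceptually, the attack boils down to the following loop. This is a simplified sketch of the idea, not ART's actual implementation, and it assumes the membership attack's infer method accepts a probabilities flag (available in recent ART versions):
def infer_via_membership(mem_attack, x_without_feature, y, feature_index, candidate_values):
    # for each candidate value, insert it as the attacked feature's column and keep,
    # per sample, the value the membership attack is most confident about
    scores = []
    for value in candidate_values:
        x_candidate = np.insert(x_without_feature, feature_index, value, axis=1)
        # probabilities=True is assumed here (recent ART versions expose it)
        scores.append(mem_attack.infer(x_candidate, y, probabilities=True).ravel())
    best = np.argmax(np.stack(scores), axis=0)
    return np.array(candidate_values)[best]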
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox
mem_attack = MembershipInferenceBlackBox(art_classifier)
# fit the membership attack with member samples (from the training set) and non-member samples (from the test set)
mem_attack.fit(x_train[:attack_train_size], y_train[:attack_train_size], x_test[:attack_test_size], y_test[:attack_test_size])
from art.attacks.inference.attribute_inference import AttributeInferenceMembership
attack = AttributeInferenceMembership(art_classifier, mem_attack, attack_feature=attack_feature)
# infer values
inferred_train = attack.infer(attack_x_test, attack_y_test, values=values)
# check accuracy
train_acc = np.sum(inferred_train == np.around(attack_x_test_feature, decimals=8).reshape(1,-1)) / len(inferred_train)
print(train_acc)
0.6335288669342389
We can see that this attack performs better than the regular black-box attack, even though it still assumes only black-box access to the model (it employs a black-box membership attack). It even slightly outperforms the first white-box attack, but does not reach the accuracy of the second one.