In this notebook, we will learn how to use ART to run a clean-label feature collision poisoning attack on a neural network trained with Keras. We will train on a subset of the CIFAR-10 dataset. The method is derived from the paper "Poison Frogs! Targeted Clean-Label Poisoning Attacks on Neural Networks" by Shafahi, Huang, et al., 2018.
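At a high level, each poison example is crafted so that it collides with a chosen target instance in the feature space of the victim network while staying visually close to a base instance from another class. A simplified statement of the objective from Shafahi et al. (ART handles the actual optimization internally) is

$$\mathbf{p} = \arg\min_{\mathbf{x}} \; \lVert f(\mathbf{x}) - f(\mathbf{t}) \rVert_2^2 + \beta \, \lVert \mathbf{x} - \mathbf{b} \rVert_2^2$$

where $f$ is the network up to the chosen feature layer, $\mathbf{t}$ is the target instance, $\mathbf{b}$ is the base instance, and $\beta$ trades off feature collision against visual similarity to the base (related to the similarity_coeff argument used later in this notebook).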
import os, sys
from os.path import abspath
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import warnings
warnings.filterwarnings('ignore')
from keras.models import load_model
from art import config
from art.utils import load_dataset, get_file
from art.estimators.classification import KerasClassifier
from art.attacks.poisoning import FeatureCollisionAttack
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
np.random.seed(301)
Using TensorFlow backend.
(x_train, y_train), (x_test, y_test), min_, max_ = load_dataset('cifar10')
num_samples_train = 1000
num_samples_test = 1000
x_train = x_train[0:num_samples_train]
y_train = y_train[0:num_samples_train]
x_test = x_test[0:num_samples_test]
y_test = y_test[0:num_samples_test]
class_descr = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
In this example, we use a model pretrained on the CIFAR-10 dataset (the cifar_alexnet.h5 weights downloaded below).
path = get_file('cifar_alexnet.h5',extract=False, path=config.ART_DATA_PATH,
url='https://www.dropbox.com/s/ta75pl4krya5djj/cifar_alexnet.h5?dl=1')
classifier_model = load_model(path)
classifier = KerasClassifier(clip_values=(min_, max_), model=classifier_model, use_logits=False,
preprocessing=(0.5, 1))
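Before crafting any poisons, it helps to confirm that the pretrained classifier behaves sensibly on our test subset. The snippet below is a minimal sanity check using only classifier.predict and NumPy; the exact number will depend on the downloaded weights.
baseline_preds = np.argmax(classifier.predict(x_test), axis=1)  # predicted class indices
baseline_acc = np.mean(baseline_preds == np.argmax(y_test, axis=1))  # fraction of correct predictions
print("Baseline accuracy on the test subset: {:.2%}".format(baseline_acc))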
target_class = "bird" # one of ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
target_label = np.zeros(len(class_descr))
target_label[class_descr.index(target_class)] = 1
target_instance = np.expand_dims(x_test[np.argmax(y_test, axis=1) == class_descr.index(target_class)][3], axis=0)
fig = plt.imshow(target_instance[0])
print('true_class: ' + target_class)
print('predicted_class: ' + class_descr[np.argmax(classifier.predict(target_instance), axis=1)[0]])
feature_layer = classifier.layer_names[-2]  # use the penultimate layer's activations as the feature space
true_class: bird
predicted_class: bird
The attacker wants to make it such that whenever a prediction is made on this particular bird, the output will be frog.
base_class = "frog" # one of ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
base_idxs = np.argmax(y_test, axis=1) == class_descr.index(base_class)
base_instances = np.copy(x_test[base_idxs][:10])
base_labels = y_test[base_idxs][:10]
x_test_pred = np.argmax(classifier.predict(base_instances), axis=1)
nb_correct_pred = np.sum(x_test_pred == np.argmax(base_labels, axis=1))
print("New test data to be poisoned (10 images):")
print("Correctly classified: {}".format(nb_correct_pred))
print("Incorrectly classified: {}".format(10-nb_correct_pred))
New test data to be poisoned (10 images):
Correctly classified: 9
Incorrectly classified: 1
plt.figure(figsize=(10,10))
for i in range(0, 9):
    pred_label, true_label = class_descr[x_test_pred[i]], class_descr[np.argmax(base_labels[i])]
    plt.subplot(330 + 1 + i)
    fig = plt.imshow(base_instances[i])
    fig.axes.get_xaxis().set_visible(False)
    fig.axes.get_yaxis().set_visible(False)
    fig.axes.text(0.5, -0.1, pred_label + " (" + true_label + ")", fontsize=12, transform=fig.axes.transAxes,
                  horizontalalignment='center')
The caption under each image reads: predicted label (true label).
attack = FeatureCollisionAttack(classifier, target_instance, feature_layer, max_iter=10, similarity_coeff=256, watermark=0.3)
poison, poison_labels = attack.poison(base_instances)
poison_pred = np.argmax(classifier.predict(poison), axis=1)
plt.figure(figsize=(10,10))
for i in range(0, 9):
    pred_label, true_label = class_descr[poison_pred[i]], class_descr[np.argmax(poison_labels[i])]
    plt.subplot(330 + 1 + i)
    fig = plt.imshow(poison[i])
    fig.axes.get_xaxis().set_visible(False)
    fig.axes.get_yaxis().set_visible(False)
    fig.axes.text(0.5, -0.1, pred_label + " (" + true_label + ")", fontsize=12, transform=fig.axes.transAxes,
                  horizontalalignment='center')
Notice how the network classifies most of these poison examples as frogs, and it is not wrong to do so: to a human, the examples still look mostly like frogs. A slight watermark of the target instance is also added to push the poisons closer to the target instance in feature space.
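To make the feature-space claim concrete, we can compare activations at the chosen layer. The sketch below assumes classifier.get_activations (ART's method for extracting intermediate activations) accepts the layer name stored in feature_layer; after the attack, the poisons should sit closer to the target than the unmodified base instances do.
# Distances to the target in the feature space of feature_layer
target_feats = classifier.get_activations(target_instance, feature_layer).reshape(1, -1)
poison_feats = classifier.get_activations(poison, feature_layer).reshape(len(poison), -1)
base_feats = classifier.get_activations(base_instances, feature_layer).reshape(len(base_instances), -1)
poison_dists = np.linalg.norm(poison_feats - target_feats, axis=1)
base_dists = np.linalg.norm(base_feats - target_feats, axis=1)
print("Mean feature distance to target: poisons {:.2f}, bases {:.2f}".format(poison_dists.mean(), base_dists.mean()))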
classifier.set_learning_phase(True)  # put Keras in training mode before fine-tuning on the poisoned data
print(x_train.shape)
print(base_instances.shape)
adv_train = np.vstack([x_train, poison])
adv_labels = np.vstack([y_train, poison_labels])
classifier.fit(adv_train, adv_labels, nb_epochs=5, batch_size=4)
(1000, 32, 32, 3)
(10, 32, 32, 3)
Epoch 1/5
252/252 [==============================] - 34s 136ms/step - loss: 0.4635 - acc: 0.9206
Epoch 2/5
252/252 [==============================] - 26s 101ms/step - loss: 0.3405 - acc: 0.9325
Epoch 3/5
252/252 [==============================] - 26s 102ms/step - loss: 0.2121 - acc: 0.9534
Epoch 4/5
252/252 [==============================] - 26s 102ms/step - loss: 0.1950 - acc: 0.9742
Epoch 5/5
252/252 [==============================] - 30s 118ms/step - loss: 0.1888 - acc: 0.9712
fig = plt.imshow(target_instance[0])
print('true_class: ' + target_class)
print('predicted_class: ' + class_descr[np.argmax(classifier.predict(target_instance), axis=1)[0]])
true_class: bird
predicted_class: frog
These attacks give adversaries who can poison your training data the ability to cause any particular target instance of their choosing to be misclassified, without manipulating any labels.
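One reason these attacks are hard to detect is that overall model quality barely changes; only the chosen target is meant to flip class. As a minimal sketch of that stealth property, we can re-check accuracy on the clean test subset loaded earlier (switching Keras back to inference mode first).
classifier.set_learning_phase(False)  # back to inference mode after fine-tuning
clean_preds = np.argmax(classifier.predict(x_test), axis=1)
clean_acc = np.mean(clean_preds == np.argmax(y_test, axis=1))
print("Accuracy on the clean test subset after poisoning: {:.2%}".format(clean_acc))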