# Source code for perceptron.utils.adversarial.base

# Copyright 2019 Baidu Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides a class that represents an adversarial example."""

import numpy as np
import numbers
from abc import ABC
from perceptron.utils.distances import MSE
from perceptron.utils.distances import Distance


class StopAttack(Exception):
    """Signal that an attack should terminate early.

    Raised to request early stopping once a given (optional) distance
    threshold has been reached.
    """


class Adversarial(ABC):
    """Defines the base class of an adversarial that should be found and
    stores the result. The :class:`Adversarial` class represents a single
    adversarial example for a given model, criterion and reference image.
    It can be passed to an adversarial attack to find the actual adversarial.

    Parameters
    ----------
    model : a :class:`Model` instance
        The model that should be evaluated against the adversarial.
    criterion : a :class:`Criterion` instance
        The criterion that determines which images are adversarial.
    original_image : a :class:`numpy.ndarray`
        The original image to which the adversarial image should
        be as close as possible.
    original_pred : int(ClsAdversarial) or dict(DetAdversarial)
        The ground-truth predictions of the original image.
    threshold : float or :class:`Distance`
        If not None, the attack will stop as soon as the adversarial
        perturbation has a size smaller than or equal to this threshold.
        Can be an instance of the :class:`Distance` class passed to the
        distance argument, or a float assumed to have the same unit as
        the given distance. If None, the attack will simply minimize
        the distance as well as possible. Note that the threshold only
        influences early stopping of the attack; the returned adversarial
        does not necessarily have smaller perturbation size than this
        threshold; the `reached_threshold()` method can be used to check
        if the threshold has been reached.
    distance : a :class:`Distance` class
        The measure used to quantify similarity between images.
    verbose : bool
        If True, a message is printed every time a new best adversarial
        is recorded.
    """

    def __init__(
            self,
            model,
            criterion,
            original_image,
            original_pred=None,
            threshold=None,
            distance=MSE,
            verbose=False):

        self._model = model
        self._criterion = criterion
        self._original_image = original_image
        # Reference image used for distance computations; it can be
        # re-cast with set_distance_dtype().
        self._original_image_for_distance = original_image
        self._original_pred = original_pred
        self._distance = distance

        # Wrap a plain number so the threshold can be compared against
        # the Distance instances stored as best distance.
        if threshold is not None and not isinstance(threshold, Distance):
            threshold = distance(value=threshold)
        self._threshold = threshold
        self.verbose = verbose
        self._best_adversarial = None
        self._best_distance = distance(value=np.inf)
        self._best_adversarial_output = None

        self._total_prediction_calls = 0
        self._total_gradient_calls = 0

        self._best_prediction_calls = 0
        self._best_gradient_calls = 0

        # used for attacks that can provide a verifiable bound
        self._verifiable_bounds = (0., 0.)

        # check if the original image is already adversarial
        try:
            self.predictions(original_image)
        except StopAttack:
            # if a threshold is specified and the original input is
            # misclassified, this can already cause a StopAttack
            # exception; the best distance recorded so far must then be
            # the distance of the original image to itself. The check
            # has to look at the best distance *instance*, not at the
            # distance class stored in self._distance.
            assert self._best_distance.value == 0.

    def _reset(self):
        """Forget the best adversarial found so far and re-evaluate the
        original image.

        The total prediction/gradient call counters are not reset. The
        final call to `predictions` is not guarded, so a
        :class:`StopAttack` raised by it propagates to the caller.
        """
        self._best_adversarial = None
        self._best_distance = self._distance(value=np.inf)
        self._best_adversarial_output = None

        self._best_prediction_calls = 0
        self._best_gradient_calls = 0

        self.predictions(self._original_image)

    @property
    def verifiable_bounds(self):
        """The verifiable bounds obtained so far."""
        return self._verifiable_bounds

    @verifiable_bounds.setter
    def verifiable_bounds(self, bounds):
        """The setter of verifiable bounds"""
        self._verifiable_bounds = bounds

    @property
    def image(self):
        """The best adversarial found so far.

        None if no adversarial has been found.
        """
        return self._best_adversarial

    @property
    def output(self):
        """The model predictions for the best adversarial found so far.

        None if no adversarial has been found.
        """
        return self._best_adversarial_output

    @property
    def distance(self):
        """The distance of the best adversarial found so far to the
        original input (initialised with a value of infinity)."""
        return self._best_distance

    @property
    def original_image(self):
        """The original input."""
        return self._original_image

    @property
    def original_pred(self):
        """The original label."""
        return self._original_pred

    def set_distance_dtype(self, dtype):
        """Set the dtype used for the original image when computing
        distances.

        Parameters
        ----------
        dtype : numpy dtype
            Must compare ``>=`` the dtype of the original image
            (presumably: no narrowing conversion -- see NumPy's dtype
            ordering).
        """
        assert dtype >= self._original_image.dtype
        # copy=False avoids a copy when the dtype already matches.
        self._original_image_for_distance = self._original_image.astype(
            dtype, copy=False)

    def reset_distance_dtype(self):
        """Reset the dtype of Distance, i.e. use the original image
        as is for distance computations again."""
        self._original_image_for_distance = self._original_image

    def normalized_distance(self, image):
        """Calculates the distance of a given image to the
        original image.

        Parameters
        ----------
        image : `numpy.ndarray`
            The image that should be compared to the original image.

        Returns
        -------
        :class:`Distance`
            The distance between the given image and the original image.
        """
        return self._distance(
            self._original_image_for_distance,
            image,
            bounds=self.bounds())

    def reached_threshold(self):
        """Returns True if a threshold is given and the currently
        best adversarial distance is smaller than or equal to the
        threshold."""
        return self._threshold is not None \
            and self._best_distance <= self._threshold

    def target_class(self):
        """Interface to criterion.target_class for attacks.

        Returns None if the call raises an AttributeError (e.g. the
        criterion does not provide `target_class`).
        """
        try:
            target_class = self._criterion.target_class()
        except AttributeError:
            target_class = None
        return target_class

    def num_classes(self):
        """Return number of classes."""
        n = self._model.num_classes()
        assert isinstance(n, numbers.Number)
        return n

    def bounds(self):
        """Return bounds of model as a ``(min_, max_)`` tuple."""
        min_, max_ = self._model.bounds()
        assert isinstance(min_, numbers.Number)
        assert isinstance(max_, numbers.Number)
        assert min_ < max_
        return min_, max_

    def in_bounds(self, input_):
        """Check if all values of the input lie within the model bounds
        (both ends inclusive)."""
        min_, max_ = self.bounds()
        return min_ <= input_.min() and input_.max() <= max_

    def channel_axis(self, batch):
        """ Interface to model.channel_axis for attacks.

        Parameters
        ----------
        batch : bool
            Controls whether the index of the axis for a batch of images
            (4 dimensions) or a single image (3 dimensions) should be
            returned.

        Returns
        -------
        int
            The channel axis reported by the model if `batch` is true,
            otherwise that axis reduced by one.
        """
        axis = self._model.channel_axis()
        if not batch:
            # a single image has no leading batch dimension
            axis = axis - 1
        # The return must sit outside the `if`; otherwise batch=True
        # would silently yield None.
        return axis

    def has_gradient(self):
        """ Returns true if _backward and _forward_backward can be called
        by an attack, False otherwise.

        This is decided by checking that the model exposes both a
        `gradient` and a `predictions_and_gradient` attribute.
        """
        try:
            self._model.gradient
            self._model.predictions_and_gradient
        except AttributeError:
            return False
        else:
            return True

    def _new_adversarial(self, image, predictions, in_bounds):
        """Record `image` as the best adversarial if it is in bounds and
        closer to the original image than the current best.

        Parameters
        ----------
        image : `numpy.ndarray`
            The adversarial image.
        predictions : :class:`numpy.ndarray`
            The model predictions for `image`.
        in_bounds : bool
            Whether `image` lies within the model bounds.

        Returns
        -------
        (bool, :class:`Distance`)
            Whether the image became the new best adversarial, and its
            distance to the original image.

        Raises
        ------
        StopAttack
            If the image became the new best adversarial and the
            threshold has been reached. The new best is stored before
            the exception is raised.
        """
        image = image.copy()  # to prevent accidental inplace changes
        distance = self.normalized_distance(image)
        if in_bounds and self._best_distance > distance:
            # new best adversarial
            if self.verbose:
                print('new best adversarial: {}'.format(distance))

            self._best_adversarial = image
            self._best_distance = distance
            self._best_adversarial_output = predictions

            # remember how many calls were needed to get here
            self._best_prediction_calls = self._total_prediction_calls
            self._best_gradient_calls = self._total_gradient_calls

            if self.reached_threshold():
                raise StopAttack

            return True, distance
        return False, distance

    def _is_adversarial(self, image, predictions, in_bounds):
        """Interface to `criterion.is_adversarial()` that calls
        _new_adversarial if necessary.

        Parameters
        ----------
        image : `numpy.ndarray`
            Image with shape (height, width, channels).
        predictions : :class:`numpy.ndarray`
            A vector with the predictions for some image.
        in_bounds : bool
            Whether `image` lies within the model bounds.

        Returns
        -------
        (bool, bool, :class:`Distance` or None)
            Whether the image is adversarial, whether it is the new best
            adversarial, and its distance to the original image (None if
            the image is not adversarial).
        """
        is_adversarial = self._criterion.is_adversarial(
            predictions, self._original_pred)
        assert isinstance(is_adversarial, (bool, np.bool_))
        if is_adversarial:
            is_best, distance = self._new_adversarial(
                image, predictions, in_bounds)
        else:
            is_best = False
            distance = None
        return is_adversarial, is_best, distance

    def predictions(self, image, strict=True, return_details=False):
        """Interface to model.predictions for attacks.

        Parameters
        ----------
        image : `numpy.ndarray`
            Image with shape (height, width, channels).
        strict : bool
            Controls if the bounds for the pixel values should be checked.
        return_details : bool
            If True, additionally return whether the image is the new
            best adversarial and its distance to the original image.

        Returns
        -------
        tuple
            ``(predictions, is_adversarial)``, or
            ``(predictions, is_adversarial, is_best, distance)`` if
            `return_details` is True.
        """
        in_bounds = self.in_bounds(image)
        assert not strict or in_bounds

        self._total_prediction_calls += 1
        predictions = self._model.predictions(image)
        is_adversarial, is_best, distance = self._is_adversarial(
            image, predictions, in_bounds)

        if return_details:
            return predictions, is_adversarial, is_best, distance
        return predictions, is_adversarial

    def batch_predictions(
            self, images, greedy=False, strict=True, return_details=False):
        """Interface to model.batch_predictions for attacks.

        Parameters
        ----------
        images : `numpy.ndarray`
            Batch of images with shape (batch, height, width, channels).
        greedy : bool
            Whether the first adversarial should be returned.
        strict : bool
            Controls if the bounds for the pixel values should be checked.
        return_details : bool
            Only allowed together with `greedy`. If True, additionally
            return whether the first adversarial is the new best one and
            its distance to the original image.

        Returns
        -------
        tuple
            Without `greedy`: ``(predictions, is_adversarial)`` where
            `is_adversarial` is a 1-d array with one entry per image.
            With `greedy`: ``(predictions, is_adversarial, index)``,
            extended by ``is_best, distance`` if `return_details` is
            True; `index` is the position of the first adversarial or
            None if there is none.
        """
        if strict:
            in_bounds = self.in_bounds(images)
            assert in_bounds

        self._total_prediction_calls += len(images)
        predictions = self._model.batch_predictions(images)

        assert predictions.ndim == 2
        assert predictions.shape[0] == images.shape[0]

        if return_details:
            assert greedy

        adversarials = []
        for i, prediction in enumerate(predictions):
            if strict:
                # the whole batch has already been verified above
                in_bounds_i = True
            else:
                in_bounds_i = self.in_bounds(images[i])
            is_adversarial, is_best, distance = self._is_adversarial(
                images[i], prediction, in_bounds_i)
            if is_adversarial and greedy:
                if return_details:
                    return predictions, is_adversarial, i, is_best, distance
                return predictions, is_adversarial, i
            adversarials.append(is_adversarial)

        if greedy:  # pragma: no cover
            # no adversarial found
            if return_details:
                return predictions, False, None, False, None
            return predictions, False, None

        is_adversarial = np.array(adversarials)
        assert is_adversarial.ndim == 1
        assert is_adversarial.shape[0] == images.shape[0]

        return predictions, is_adversarial

    def gradient(self, image=None, label=None, strict=True):
        """Interface to model.gradient for attacks.

        Not implemented in this base class.

        Parameters
        ----------
        image : `numpy.ndarray`
            Image with shape (height, width, channels).
            Defaults to the original image.
        label : int
            Label used to calculate the loss that is differentiated.
            Defaults to the original label.
        strict : bool
            Controls if the bounds for the pixel values should be checked.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError

    def predictions_and_gradient(
            self, image=None, label=None, strict=True, return_details=False):
        """Interface to model.predictions_and_gradient for attacks.

        Not implemented in this base class.

        Parameters
        ----------
        image : `numpy.ndarray`
            Image with shape (height, width, channels).
            Defaults to the original image.
        label : int
            Label used to calculate the loss that is differentiated.
            Defaults to the original label.
        strict : bool
            Controls if the bounds for the pixel values should be checked.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError

    def backward(self, gradient, image=None, strict=True):
        """Not implemented in this base class.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError