
# Copyright 2019 Baidu Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""C&W2 attack for evaluating model robustness."""

import warnings
import logging
import numpy as np
from tqdm import tqdm
from abc import ABC
from abc import abstractmethod
from .base import Metric
from .base import call_decorator
from perceptron.utils.image import onehot_like
from perceptron.utils.func import to_tanh_space
from perceptron.utils.func import to_model_space
from perceptron.utils.func import AdamOptimizer


class CarliniWagnerMetric(Metric, ABC):
    """The base class of the Carlini & Wagner attack.

    This attack is described in [1]_. This implementation
    is based on the reference implementation by Carlini [2]_.
    For bounds ≠ (0, 1), it differs from [2]_ because we
    normalize the squared L2 loss with the bounds.

    References
    ----------
    .. [1] Nicholas Carlini, David Wagner: "Towards Evaluating the
           Robustness of Neural Networks", https://arxiv.org/abs/1608.04644
    .. [2] https://github.com/carlini/nn_robust_attacks

    """

    @call_decorator
    def __call__(self, adv, annotation=None, unpack=True,
                 binary_search_steps=5, max_iterations=1000,
                 confidence=0, learning_rate=5e-3,
                 initial_const=1e-2, abort_early=True):
        """ The L2 version of the Carlini & Wagner attack.

        Parameters
        ----------
        adv : :class:`Adversarial`
            An :class:`Adversarial` instance
        annotation : int
            The reference label of the original input.
        unpack : bool
            If True, returns the adversarial input, otherwise returns
            the Adversarial object.
        binary_search_steps : int
            The number of steps for the binary search used to find the
            optimal tradeoff-constant between distance and confidence.
        max_iterations : int
            The maximum number of iterations. Larger values are more
            accurate; setting it too small will require a large learning
            rate and will produce poor results.
        confidence : int or float
            Confidence of adversarial examples: a higher value produces
            adversarials that are further away, but more strongly classified
            as adversarial.
        learning_rate : float
            The learning rate for the attack algorithm. Smaller values
            produce better results but take longer to converge.
        initial_const : float
            The initial tradeoff-constant to use to tune the relative
            importance of distance and confidence. If `binary_search_steps`
            is large, the initial constant is not important.
        abort_early : bool
            If True, Adam will be aborted if the loss hasn't decreased for
            some time (a tenth of max_iterations).
        """

        a = adv

        del adv
        del annotation
        del unpack

        if not a.has_gradient():
            logging.fatal('Applied gradient-based attack to model that '
                          'does not provide gradients.')
            return

        min_, max_ = a.bounds()

        if a.model_task() == 'cls':
            loss_and_gradient = self.cls_loss_and_gradient
        elif a.model_task() == 'det':
            loss_and_gradient = self.det_loss_and_gradient
        else:
            raise ValueError('Model task not supported. Check that the'
                             ' task is either cls or det')
        # variables representing inputs in attack space will be
        # prefixed with att_

        att_original = to_tanh_space(a.original_image, min_, max_)

        # will be close but not identical to a.original_image
        reconstructed_original, _ = to_model_space(att_original, min_, max_)
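        # A minimal sketch of the change of variables (assuming to_tanh_space
        # and to_model_space follow the standard C&W reparameterization; the
        # actual helpers live in perceptron.utils.func):
        #
        #     x     = (np.tanh(w) + 1) / 2 * (max_ - min_) + min_   # model space
        #     dx/dw = (1 - np.tanh(w) ** 2) / 2 * (max_ - min_)     # elementwise
        #
        # so any unconstrained perturbation in attack space maps to a valid
        # input in [min_, max_] without clipping.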

        # the binary search finds the smallest const for which we
        # find an adversarial
        const = initial_const
        lower_bound = 0
        upper_bound = np.inf

        for binary_search_step in tqdm(range(binary_search_steps)):
            if binary_search_step == binary_search_steps - 1 and \
                    binary_search_steps >= 10:
                const = upper_bound

            logging.info('starting optimization with const = {}'.format(const))
            att_perturbation = np.zeros_like(att_original)

            # create a new optimizer to minimize the perturbation
            optimizer = AdamOptimizer(att_perturbation.shape)

            found_adv = False  # found adv with the current const
            loss_at_previous_check = np.inf

            for iteration in range(max_iterations):
                x, dxdp = to_model_space(
                    att_original + att_perturbation, min_, max_)

                loss, gradient, is_adv = loss_and_gradient(
                    const, a, x, dxdp, reconstructed_original,
                    confidence, min_, max_)

                logging.info('iter: {}; loss: {}; best overall distance: {}'.format(
                    iteration, loss, a.distance))

                att_perturbation += optimizer(gradient, learning_rate)

                if is_adv:
                    # this binary search step can be considered a success
                    # but optimization continues to minimize perturbation size
                    found_adv = True

                if abort_early and \
                        iteration % (np.ceil(max_iterations / 10)) == 0:
                    # after each tenth of the iterations, check progress
                    if not (loss <= .9999 * loss_at_previous_check):
                        break  # stop Adam if there has not been progress
                    loss_at_previous_check = loss

            if found_adv:
                logging.info('found adversarial with const = {}'.format(const))
                upper_bound = const
            else:
                logging.info('failed to find adversarial '
                             'with const = {}'.format(const))
                lower_bound = const

            if upper_bound == np.inf:
                # exponential search
                const *= 10
            else:
                # binary search
                const = (lower_bound + upper_bound) / 2
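            # Illustrative schedule (values are examples, not outputs):
            # starting from initial_const = 1e-2, repeated failures scale
            # const to 1e-1, 1e0, ... until an adversarial is found; after
            # the first success, const is bisected between the largest
            # failing value (lower_bound) and the smallest succeeding one
            # (upper_bound).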

    @staticmethod
    @abstractmethod
    def lp_distance_and_grad(reference, other, span):
        """To be extended with different L_p norm."""
        raise NotImplementedError

    @classmethod
    def det_loss_and_gradient(cls, const, a, x, dxdp,
                              reconstructed_original, confidence, min_, max_):
        """Returns the loss and the gradient of the loss w.r.t. x,
        assuming that logits = model(x).
        """

        _, is_adv_loss, is_adv_loss_grad, is_adv = \
            a.predictions_and_gradient(x)

        targeted = a.target_class() is not None
        if targeted:
            c_minimize = a.target_class()
        else:
            raise NotImplementedError

        # is_adv is True as soon as the is_adv_loss goes below 0
        # but sometimes we want additional confidence

        is_adv_loss += confidence
        is_adv_loss = max(0, is_adv_loss)

        s = max_ - min_
        squared_lp_distance, squared_lp_distance_grad = \
            cls.lp_distance_and_grad(reconstructed_original, x, s)

        total_loss = squared_lp_distance + const * is_adv_loss
        total_loss_grad = squared_lp_distance_grad + const * is_adv_loss_grad

        # backprop the gradient of the loss w.r.t. x further
        # to get the gradient of the loss w.r.t. att_perturbation
        assert total_loss_grad.shape == x.shape
        assert dxdp.shape == x.shape
        # we can do a simple elementwise multiplication, because
        # grad_x_wrt_p is a matrix of elementwise derivatives
        # (i.e. each x[i] w.r.t. p[i] only, for all i) and
        # grad_loss_wrt_x is a real gradient reshaped as a matrix
        gradient = total_loss_grad * dxdp

        return total_loss, gradient, is_adv

    @classmethod
    def cls_loss_and_gradient(cls, const, a, x, dxdp,
                              reconstructed_original, confidence, min_, max_):
        """Returns the loss and the gradient of the loss w.r.t. x,
        assuming that logits = model(x).
        """

        logits, is_adv = a.predictions(x)

        targeted = a.target_class() is not None
        if targeted:
            c_minimize = cls.best_other_class(logits, a.target_class())
            c_maximize = a.target_class()
        else:
            c_minimize = a.original_pred
            c_maximize = cls.best_other_class(logits, a.original_pred)

        is_adv_loss = logits[c_minimize] - logits[c_maximize]

        # is_adv is True as soon as the is_adv_loss goes below 0
        # but sometimes we want additional confidence

        is_adv_loss += confidence
        is_adv_loss = max(0, is_adv_loss)

        s = max_ - min_
        lp_distance, lp_distance_grad = \
            cls.lp_distance_and_grad(reconstructed_original, x, s)
        total_loss = lp_distance + const * is_adv_loss

        # calculate the gradient of total_loss w.r.t. x
        logits_diff_grad = np.zeros_like(logits)
        logits_diff_grad[c_minimize] = 1
        logits_diff_grad[c_maximize] = -1
        is_adv_loss_grad = a.backward(logits_diff_grad, x)
        assert is_adv_loss >= 0
        if is_adv_loss == 0:
            is_adv_loss_grad = 0

        total_loss_grad = lp_distance_grad + const * is_adv_loss_grad
        # backprop the gradient of the loss w.r.t. x further
        # to get the gradient of the loss w.r.t. att_perturbation
        assert total_loss_grad.shape == x.shape
        assert dxdp.shape == x.shape
        # we can do a simple elementwise multiplication, because
        # grad_x_wrt_p is a matrix of elementwise derivatives
        # (i.e. each x[i] w.r.t. p[i] only, for all i) and
        # grad_loss_wrt_x is a real gradient reshaped as a matrix
        gradient = total_loss_grad * dxdp

        return total_loss, gradient, is_adv

    @staticmethod
    def best_other_class(logits, exclude):
        """Returns the index of the largest logit, ignoring the class that
        is passed as `exclude`.
        """
        other_logits = logits - onehot_like(logits, exclude, value=np.inf)
        return np.argmax(other_logits)
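    # Illustrative example (not executed): with logits = [1.0, 3.0, 2.5] and
    # exclude = 1, subtracting a one-hot vector of +inf pushes the excluded
    # entry to -inf, so best_other_class returns index 2.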


class CarliniWagnerL2Metric(CarliniWagnerMetric):
    """The L2 version of the C&W attack."""

    @staticmethod
    def lp_distance_and_grad(reference, other, span):
        """Calculate the normalized squared L2 distance and its gradient."""
        squared_l2_distance = np.sum((other - reference) ** 2) / span ** 2
        squared_l2_distance_grad = (2 / span ** 2) * (other - reference)
        return squared_l2_distance, squared_l2_distance_grad


class CarliniWagnerLinfMetric(CarliniWagnerMetric):
    """The L_inf version of the C&W attack."""

    @staticmethod
    def lp_distance_and_grad(reference, other, span):
        """Calculate the normalized L_inf distance and its subgradient."""
        diff = np.abs(other - reference)
        max_diff = np.max(diff)
        l_inf_distance = max_diff / span
        if max_diff == 0:
            l_inf_distance_grad = np.zeros_like(diff, dtype=np.float32)
        else:
            # subgradient: indicator of the entries that attain the maximum
            l_inf_distance_grad = (diff == max_diff).astype(np.float32)
        return l_inf_distance, l_inf_distance_grad
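
# A minimal usage sketch (commented out; the exact import paths for
# PyTorchModel and Misclassification are assumptions based on the
# perceptron package layout and may differ in your installation):
#
#     import torchvision.models as models
#     from perceptron.models.classification.pytorch import PyTorchModel
#     from perceptron.utils.criteria.classification import Misclassification
#     from perceptron.benchmarks.carlini_wagner import CarliniWagnerL2Metric
#
#     net = models.resnet18(pretrained=True).eval()
#     model = PyTorchModel(net, bounds=(0, 1), num_classes=1000)
#     metric = CarliniWagnerL2Metric(model, criterion=Misclassification())
#     adversarial = metric(image, label, unpack=True)  # image scaled to [0, 1]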