# Parakeet/parakeet/modules/loss.py

#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from numba import jit

from paddle import fluid
import paddle.fluid.dygraph as dg


def masked_mean(inputs, mask):
    """Average `inputs` over the positions selected by `mask`.

    Args:
        inputs (Variable): Shape(B, C, 1, T), the values to average, where
            B is the batch size, C the number of channels and T the number
            of time steps.
        mask (Variable): Shape(B, T), the mask. It is reshaped to
            (B, 1, 1, T) and tiled C times along the channel axis before
            being multiplied with `inputs`.
    Returns:
        loss (Variable): Shape(1, ), the sum of the masked inputs divided
            by the sum of the channel-expanded mask.
    """
    n_channels = inputs.shape[1]
    batch_size, n_steps = mask.shape[0], mask.shape[-1]

    # (B, T) -> (B, 1, 1, T) -> (B, C, 1, T); no gradient flows into the mask.
    mask_4d = fluid.layers.reshape(mask, shape=[batch_size, 1, 1, n_steps])
    full_mask = fluid.layers.expand(
        mask_4d, expand_times=[1, n_channels, 1, 1])
    full_mask.stop_gradient = True

    # The normaliser is the total mask weight over all channels.
    denominator = fluid.layers.reduce_sum(full_mask)
    denominator.stop_gradient = True

    numerator = fluid.layers.reduce_sum(inputs * full_mask)
    return numerator / denominator


def guided_attention(N, max_N, T, max_T, g):
    """Build the guided-attention penalty matrix for one sample.

    The valid region ``W[:N, :T]`` holds
    ``1 - exp(-(n / N - t / T)**2 / (2 * g * g))``: 0 where ``n / N`` equals
    ``t / T`` and approaching 1 away from that diagonal. Every cell outside
    the valid region stays 0.

    The matrix is filled with NumPy broadcasting rather than a
    ``numba.jit(nopython=True)`` loop: nopython mode does no bounds checking
    by default, so a length larger than the padded size used to write
    outside ``W`` without any error.

    Args:
        N (int): number of valid rows.
        max_N (int): number of rows of the returned (padded) matrix.
        T (int): number of valid columns.
        max_T (int): number of columns of the returned (padded) matrix.
        g (float): width of the band around the diagonal; smaller values
            penalise off-diagonal cells more sharply.

    Returns:
        numpy.ndarray: Shape(max_N, max_T), dtype float32.

    Raises:
        ValueError: if the valid region does not fit in the padded matrix.
        ZeroDivisionError: if ``2 * g * g`` is 0 and the valid region is
            not empty.
    """
    W = np.zeros((max_N, max_T), dtype=np.float32)

    # An empty valid region means nothing is written, as with empty loops.
    if N <= 0 or T <= 0:
        return W

    if N > max_N or T > max_T:
        raise ValueError(
            "valid region ({}, {}) does not fit in padded shape ({}, {})"
            .format(N, T, max_N, max_T))

    denom = 2 * g * g
    if denom == 0:
        raise ZeroDivisionError("g must be non-zero")

    # Relative positions in [0, 1): a column vector for rows and a row
    # vector for columns, so their difference broadcasts to (N, T).
    rows = np.arange(N, dtype=np.float64)[:, np.newaxis] / N
    cols = np.arange(T, dtype=np.float64)[np.newaxis, :] / T
    W[:N, :T] = 1 - np.exp(-((rows - cols) ** 2) / denom)
    return W


def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2,
                      max_input_len=None):
    """Build a batch of guided-attention penalty matrices.

    For each sample ``b`` the matrix returned by ``guided_attention`` for
    ``(input_lengths[b], target_lengths[b])`` is transposed, so that the
    target (decoder) axis comes before the input (encoder) axis.

    Args:
        input_lengths (numpy.ndarray or sequence of int): Shape(B,), valid
            input (encoder) lengths.
        target_lengths (numpy.ndarray or sequence of int): Shape(B,), valid
            target (decoder) lengths.
        max_target_len (int): size of the padded target axis.
        g (float): band width forwarded to ``guided_attention``.
        max_input_len (int, optional): size of the padded input axis.
            Defaults to the largest value in ``input_lengths`` (0 for an
            empty batch).

    Returns:
        numpy.ndarray: Shape(B, max_target_len, max_input_len), dtype
            float32.

    Raises:
        ValueError: if a length exceeds the padded size of its axis.
    """
    input_lengths = np.asarray(input_lengths)
    target_lengths = np.asarray(target_lengths)
    B = len(input_lengths)

    # `.max()` of an empty array raises, so an empty batch is special-cased.
    longest_input = int(input_lengths.max()) if B > 0 else 0
    if max_input_len is None:
        max_input_len = longest_input
    elif max_input_len < longest_input:
        raise ValueError(
            "max_input_len ({}) is smaller than the longest input ({})"
            .format(max_input_len, longest_input))

    W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32)
    for b in range(B):
        n_input = input_lengths[b]
        n_target = target_lengths[b]
        # Checked here so an oversized length can never reach a fill routine
        # that does not bounds-check its writes.
        if n_target > max_target_len:
            raise ValueError(
                "target_lengths[{}] ({}) exceeds max_target_len ({})"
                .format(b, n_target, max_target_len))
        W[b] = guided_attention(n_input, max_input_len, n_target,
                                max_target_len, g).T
    return W


class TTSLoss(object):
    """Loss terms for a text-to-speech model.

    The instance only stores weighting hyper-parameters; each method computes
    one loss term from the tensors it is given.
    """

    def __init__(self,
                 masked_weight=0.0,
                 priority_weight=0.0,
                 binary_divergence_weight=0.0,
                 guided_attention_sigma=0.2):
        """
        Args:
            masked_weight (float): weight of the masked mean when it is
                blended with the plain mean; masking is skipped when this
                is not positive.
            priority_weight (float): weight of the priority-channel l1 term
                in ``l1_loss``; the term is skipped when this is not
                positive.
            binary_divergence_weight (float): stored for callers; it is not
                read by any method of this class.
            guided_attention_sigma (float): band width of the guide used by
                ``attention_loss``.
        """
        self.masked_weight = masked_weight
        self.priority_weight = priority_weight
        self.binary_divergence_weight = binary_divergence_weight
        self.guided_attention_sigma = guided_attention_sigma

    def _masked_blend(self, values, mask):
        """Reduce `values` to a scalar, blending masked and plain means.

        Returns ``w * masked_mean + (1 - w) * mean`` with
        ``w = self.masked_weight`` when ``w > 0`` and a mask is given,
        otherwise the plain mean.
        """
        w = self.masked_weight
        if w > 0 and mask is not None:
            return w * masked_mean(values, mask) + (
                1 - w) * fluid.layers.reduce_mean(values)
        return fluid.layers.reduce_mean(values)

    @staticmethod
    def _pad_encoder_axis(soft_mask, width):
        """Zero-pad the last axis of `soft_mask` up to `width` columns.

        The array is returned unchanged when it is already at least `width`
        wide. Padded columns are 0, like every other cell of the guide
        that lies outside a sample's valid region.
        """
        missing = width - soft_mask.shape[-1]
        if missing <= 0:
            return soft_mask
        pad_spec = [(0, 0)] * (soft_mask.ndim - 1) + [(0, missing)]
        return np.pad(soft_mask, pad_spec, mode="constant")

    def l1_loss(self, prediction, target, mask, priority_bin=None):
        """Mask-weighted l1 loss, optionally emphasising the first channels.

        Args:
            prediction (Variable): predicted values; presumably
                Shape(B, C, 1, T) as expected by ``masked_mean`` when a
                mask is used.
            target (Variable): ground truth, same shape as `prediction`.
            mask (Variable or None): Shape(B, T) mask forwarded to
                ``masked_mean``; ``None`` disables masking.
            priority_bin (int, optional): the channels ``[0, priority_bin)``
                along axis 1 form the priority term.

        Returns:
            loss (Variable): Shape(1, ), the l1 loss.
        """
        abs_diff = fluid.layers.abs(prediction - target)

        # basic mask-weighted l1 loss
        base_l1_loss = self._masked_blend(abs_diff, mask)

        p = self.priority_weight
        if not (p > 0 and priority_bin is not None):
            return base_l1_loss

        # mask-weighted priority channels' l1-loss
        priority_abs_diff = fluid.layers.slice(
            abs_diff, axes=[1], starts=[0], ends=[priority_bin])
        priority_loss = self._masked_blend(priority_abs_diff, mask)

        # priority weighted sum
        return p * priority_loss + (1 - p) * base_l1_loss

    def binary_divergence(self, prediction, target, mask):
        """Mask-weighted element-wise log loss between two tensors.

        Args:
            prediction (Variable): predicted values.
            target (Variable): ground truth with the same number of
                elements as `prediction`.
            mask (Variable or None): Shape(B, T) mask forwarded to
                ``masked_mean``; ``None`` disables masking.

        Returns:
            loss (Variable): Shape(1, ), the binary divergence.
        """
        # log_loss is applied on flattened (-1, 1) columns, then the result
        # is restored to the shape of the prediction.
        flattened_prediction = fluid.layers.reshape(prediction, [-1, 1])
        flattened_target = fluid.layers.reshape(target, [-1, 1])
        flattened_loss = fluid.layers.log_loss(
            flattened_prediction, flattened_target, epsilon=1e-8)
        bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)

        return self._masked_blend(bin_div, mask)

    @staticmethod
    def done_loss(done_hat, done):
        """Mean log loss between the predicted and the true done flags.

        Args:
            done_hat (Variable): predicted done flags.
            done (Variable): ground-truth done flags with the same number
                of elements as `done_hat`.

        Returns:
            loss (Variable): Shape(1, ), the done loss.
        """
        flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1])
        flat_done = fluid.layers.reshape(done, [-1, 1])
        loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
        loss = fluid.layers.reduce_mean(loss)
        return loss

    def attention_loss(self, predicted_attention, input_lengths,
                       target_lengths):
        """
        Given valid encoder_lengths and decoder_lengths, compute a diagonal
        guide, and compute loss from the predicted attention and the guide.

        Args:
            predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the
                alignment tensor, where B means batch size, T_dec means number
                of time steps of the decoder, T_enc means the number of time
                steps of the encoder, * means other possible dimensions.
            input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
                (time steps) of encoder outputs.
            target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
                valid lengths (time steps) of decoder outputs.

        Returns:
            loss (Variable): Shape(1, ) attention loss.
        """
        # Only the two trailing axes matter here, so any number of leading
        # dimensions is accepted, as the docstring promises.
        max_target_len, max_input_len = predicted_attention.shape[-2:]
        soft_mask = guided_attentions(input_lengths, target_lengths,
                                      max_target_len,
                                      self.guided_attention_sigma)
        # The guide is as wide as the longest input; widen it with zeros
        # when the attention's encoder axis is padded beyond that.
        soft_mask = self._pad_encoder_axis(soft_mask, max_input_len)
        soft_mask_ = dg.to_variable(soft_mask)
        loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
        return loss