# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numba import numpy as np import paddle from paddle import nn from paddle.nn import functional as F from paddle.fluid.layers import sequence_mask __all__ = [ "guided_attention_loss", "weighted_mean", "masked_l1_loss", "masked_softmax_with_cross_entropy", ] def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): """Build that W matrix. shape(B, T_dec, T_enc) W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) See also: Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. “Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention.” ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969. """ dtype = dtype or paddle.get_default_dtype() dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze( -1) # n/N # shape(B, T_dec) enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze( -1) # t/T # shape(B, T_enc) W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 / (2 * g**2)) dec_mask = sequence_mask(dec_lens, maxlen=N) enc_mask = sequence_mask(enc_lens, maxlen=T) mask = dec_mask.unsqueeze(-1) * enc_mask.unsqueeze(1) mask = paddle.cast(mask, W.dtype) W *= mask return W def guided_attention_loss(attention_weight, dec_lens, enc_lens, g): """Guided attention loss, masked to excluded padding parts.""" _, N, T = attention_weight.shape W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype) total_tokens = (dec_lens * enc_lens).astype(W.dtype) loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens) return loss def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. Parameters ----------- input : Tensor The input tensor. weight : Tensor The weight tensor with broadcastable shape with the input. Returns ---------- Tensor [shape=(1,)] Weighted mean tensor with the same dtype as input. """ weight = paddle.cast(weight, input.dtype) broadcast_ratio = input.size / weight.size return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio) def masked_l1_loss(prediction, target, mask): """Compute maksed L1 loss. Parameters ---------- prediction : Tensor The prediction. target : Tensor The target. The shape should be broadcastable to ``prediction``. mask : Tensor The mask. The shape should be broadcatable to the broadcasted shape of ``prediction`` and ``target``. Returns ------- Tensor [shape=(1,)] The masked L1 loss. """ abs_error = F.l1_loss(prediction, target, reduction='none') loss = weighted_mean(abs_error, mask) return loss def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): """Compute masked softmax with cross entropy loss. Parameters ---------- logits : Tensor The logits. The ``axis``-th axis is the class dimension. label : Tensor [dtype: int] The label. The size of the ``axis``-th axis should be 1. mask : Tensor The mask. The shape should be broadcastable to ``label``. axis : int, optional The index of the class dimension in the shape of ``logits``, by default -1. Returns ------- Tensor [shape=(1,)] The masked softmax with cross entropy loss. """ ce = F.softmax_with_cross_entropy(logits, label, axis=axis) loss = weighted_mean(ce, mask) return loss