import numba
import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F


def weighted_mean(input, weight):
    """Weighted mean. (It can also be used as a masked mean.)

    Note that the mean is taken over all elements, so with a 0/1 mask the
    result is scaled by the fraction of unmasked elements rather than being
    normalized by the mask sum.

    Args:
        input (Tensor): input tensor, floating point dtype.
        weight (Tensor): weight tensor with a shape broadcastable to input.

    Returns:
        Tensor: shape(1,), weighted mean tensor with the same dtype as input.
    """
    weight = paddle.cast(weight, input.dtype)
    return paddle.mean(input * weight)


def masked_l1_loss(prediction, target, mask):
    """L1 loss averaged with a (0/1) mask so that padded positions are ignored."""
    abs_error = F.l1_loss(prediction, target, reduction='none')
    return weighted_mean(abs_error, mask)


def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
    """Softmax cross entropy averaged with a (0/1) mask over padded positions."""
    ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
    return weighted_mean(ce, mask)


def diagonal_loss(attentions,
                  input_lengths,
                  target_lengths,
                  g=0.2,
                  multihead=False):
    """A metric to evaluate how diagonal an attention distribution is.

    Lower values indicate a more diagonal (monotonic) alignment, since the
    guided-attention weights are close to 0 on the diagonal and close to 1
    away from it.
    """
    W = guided_attentions(input_lengths, target_lengths, g)
    W_tensor = paddle.to_tensor(W)
    if not multihead:
        # attentions: (B, T_dec, T_enc)
        return paddle.mean(attentions * W_tensor)
    else:
        # attentions: (B, num_heads, T_dec, T_enc); broadcast W over heads
        return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))


@numba.jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    """Build a guided-attention weight matrix for one utterance.

    Entries are small near the diagonal (n / N close to t / T) and approach
    1 away from it; g controls the width of the diagonal band.
    """
    W = np.zeros((max_T, max_N), dtype=np.float32)
    for t in range(T):
        for n in range(N):
            W[t, n] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
    # (T_dec, T_enc)
    return W


def guided_attentions(input_lengths, target_lengths, g=0.2):
    """Build a batch of guided-attention weight matrices, zero-padded to the
    maximum encoder and decoder lengths in the batch."""
    B = len(input_lengths)
    max_input_len = input_lengths.max()
    max_target_len = target_lengths.max()
    W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32)
    for b in range(B):
        W[b] = guided_attention(input_lengths[b], max_input_len,
                                target_lengths[b], max_target_len, g)
    # (B, T_dec, T_enc)
    return W
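

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the module API): all shapes, lengths, and
# random inputs below are illustrative assumptions, not values taken from any
# particular model.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    batch_size, max_dec_len, max_enc_len, feat_dim = 4, 20, 30, 80
    target_lengths = np.array([20, 18, 15, 12], dtype=np.int64)
    input_lengths = np.array([30, 25, 22, 18], dtype=np.int64)

    # Masked L1 between padded decoder outputs and targets.
    prediction = paddle.randn([batch_size, max_dec_len, feat_dim])
    target = paddle.randn([batch_size, max_dec_len, feat_dim])
    # frame_mask: 1.0 for valid frames, 0.0 for padding; broadcast over feat_dim.
    frame_mask = (np.arange(max_dec_len)[None, :] <
                  target_lengths[:, None]).astype(np.float32)
    mask = paddle.unsqueeze(paddle.to_tensor(frame_mask), -1)
    print("masked L1 loss:", float(masked_l1_loss(prediction, target, mask)))

    # Diagonal metric over a random (B, T_dec, T_enc) attention map.
    attentions = F.softmax(
        paddle.randn([batch_size, max_dec_len, max_enc_len]), axis=-1)
    print("diagonal loss:",
          float(diagonal_loss(attentions, input_lengths, target_lengths)))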