# Parakeet/parakeet/modules/losses.py
#
# 55 lines
# 1.9 KiB
# Python

import numba
import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F
def weighted_mean(input, weight):
    """Weighted mean of ``input``; it can also be used as a masked mean.

    The elementwise product ``input * weight`` is summed and divided by the
    sum of the weights that were applied (after broadcasting), so elements
    with zero weight do not contribute to the numerator or the denominator.
    Dividing by the plain element count instead would shrink the result in
    proportion to how many elements are masked out.

    Args:
        input (Tensor): input tensor, floating point dtype.
        weight (Tensor): weight tensor with broadcastable shape. It is cast
            to ``input.dtype`` before use.

    Returns:
        Tensor: the weighted mean, with the same dtype as input. The result
        is not finite when the applied weights sum to zero (0 / 0).
    """
    weight = paddle.cast(weight, input.dtype)
    # Expand the weight to the shape of the product so that a weight which
    # broadcasts along some axis is counted once per element it scales.
    applied_weight = paddle.ones_like(input) * weight
    return paddle.sum(input * weight) / paddle.sum(applied_weight)
def masked_l1_loss(prediction, target, mask):
    """L1 loss between ``prediction`` and ``target``, weighted by ``mask``.

    Args:
        prediction (Tensor): predicted values.
        target (Tensor): reference values.
        mask (Tensor): weights passed to ``weighted_mean`` together with
            the unreduced absolute error.

    Returns:
        Tensor: the value returned by ``weighted_mean``.
    """
    # reduction='none' keeps the per-element error so the mask can be applied.
    elementwise_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(elementwise_error, mask)
    return loss
def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
    """Softmax cross entropy of ``logits`` against ``label``, weighted by ``mask``.

    Args:
        logits (Tensor): unnormalized scores.
        label (Tensor): target labels.
        mask (Tensor): weights passed to ``weighted_mean`` together with
            the per-position cross entropy.
        axis (int, optional): axis forwarded to
            ``F.softmax_with_cross_entropy``. Defaults to -1.

    Returns:
        Tensor: the value returned by ``weighted_mean``.
    """
    per_position_ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
    loss = weighted_mean(per_position_ce, mask)
    return loss
def diagonal_loss(attentions, input_lengths, target_lengths, g=0.2, multihead=False):
    """A metric to evaluate how diagonal an attention distribution is.

    The attention weights are multiplied elementwise by the penalty matrices
    built by ``guided_attentions`` and the product is averaged over all
    elements.

    Args:
        attentions (Tensor): attention weights. Presumably shaped
            (B, T_dec, T_enc), or (B, n_heads, T_dec, T_enc) when
            ``multihead`` is True -- TODO confirm against callers.
        input_lengths: per-example encoder lengths, forwarded to
            ``guided_attentions``.
        target_lengths: per-example decoder lengths, forwarded to
            ``guided_attentions``.
        g (float, optional): width parameter forwarded to
            ``guided_attentions``. Defaults to 0.2.
        multihead (bool, optional): when True, an axis is inserted at
            position 1 of the penalty before multiplying. Defaults to False.

    Returns:
        Tensor: mean of the penalty-weighted attention.
    """
    penalty = paddle.to_tensor(
        guided_attentions(input_lengths, target_lengths, g))
    if multihead:
        # Add an axis at position 1 so the penalty broadcasts over it.
        penalty = paddle.unsqueeze(penalty, 1)
    return paddle.mean(attentions * penalty)
@numba.jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    """Build the guided-attention penalty matrix for one example.

    Entry ``[t, n]`` is ``1 - exp(-(n / N - t / T)**2 / (2 * g * g))`` for
    ``t < T`` and ``n < N``; it is 0 where ``n / N == t / T`` and grows
    towards 1 away from that diagonal. Entries outside the valid region stay 0.

    Args:
        N (int): number of valid columns (encoder steps).
        max_N (int): total number of columns in the returned matrix.
        T (int): number of valid rows (decoder steps).
        max_T (int): total number of rows in the returned matrix.
        g (float): width parameter of the Gaussian-shaped penalty.

    Returns:
        np.ndarray: float32 array of shape (max_T, max_N).
    """
    W = np.zeros((max_T, max_N), dtype=np.float32)
    for t in range(T):
        # Relative position of this row; constant across the inner loop.
        row_pos = t / T
        for n in range(N):
            offset = n / N - row_pos
            W[t, n] = 1 - np.exp(-offset**2 / (2 * g * g))
    # (T_dec, T_enc)
    return W
def guided_attentions(input_lengths, target_lengths, g=0.2):
    """Build a batch of guided-attention penalty matrices.

    Each example's matrix comes from ``guided_attention`` and is sized to
    the longest input and target length in the batch.

    Args:
        input_lengths: per-example encoder lengths; must support ``len()``,
            ``.max()`` and iteration (e.g. a 1-D numpy integer array).
        target_lengths: per-example decoder lengths; must support ``.max()``
            and integer indexing.
        g (float, optional): width parameter forwarded to
            ``guided_attention``. Defaults to 0.2.

    Returns:
        np.ndarray: float32 array of shape
        (B, max(target_lengths), max(input_lengths)).
    """
    batch_size = len(input_lengths)
    enc_max = input_lengths.max()
    dec_max = target_lengths.max()
    W = np.zeros((batch_size, dec_max, enc_max), dtype=np.float32)
    for idx, enc_len in enumerate(input_lengths):
        dec_len = target_lengths[idx]
        W[idx] = guided_attention(enc_len, enc_max, dec_len, dec_max, g)
    # (B, T_dec, T_enc)
    return W