format docstrings
This commit is contained in:
parent 3af3c29a94
commit 6553d1d723
@ -12,28 +12,26 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fastspeech2 related modules for paddle"""
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from typing import Dict
|
||||
from typing import Sequence
|
||||
from typing import Tuple
|
||||
|
||||
from typeguard import check_argument_types
|
||||
|
||||
import paddle
|
||||
import numpy as np
|
||||
from paddle import nn
|
||||
|
||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
|
||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
|
||||
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
||||
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
|
||||
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
|
||||
from parakeet.modules.nets_utils import make_non_pad_mask
|
||||
from parakeet.modules.nets_utils import make_pad_mask
|
||||
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
|
||||
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
|
||||
from parakeet.modules.nets_utils import initialize
|
||||
from parakeet.modules.nets_utils import make_non_pad_mask
|
||||
from parakeet.modules.nets_utils import make_pad_mask
|
||||
|
||||
|
||||
class FastSpeech2(nn.Layer):
|
||||
|
@ -155,7 +153,6 @@ class FastSpeech2(nn.Layer):
|
|||
positionwise_layer_type=positionwise_layer_type,
|
||||
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
|
||||
else:
|
||||
print("encoder_type:", encoder_type)
|
||||
raise ValueError(f"{encoder_type} is not supported.")
|
||||
|
||||
# define duration predictor
|
||||
|
@ -236,6 +233,12 @@ class FastSpeech2(nn.Layer):
|
|||
use_batch_norm=use_batch_norm,
|
||||
dropout_rate=postnet_dropout_rate, ))
|
||||
|
||||
# initialize parameters
|
||||
self._reset_parameters(
|
||||
init_type=init_type,
|
||||
init_enc_alpha=init_enc_alpha,
|
||||
init_dec_alpha=init_dec_alpha, )
|
||||
|
||||
# define criterions
|
||||
self.criterion = FastSpeech2Loss(
|
||||
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
|
||||
|
@ -253,25 +256,37 @@ class FastSpeech2(nn.Layer):
|
|||
energy: paddle.Tensor,
|
||||
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
|
||||
str, paddle.Tensor], paddle.Tensor]:
|
||||
# """Calculate forward propagation.
|
||||
"""Calculate forward propagation.
|
||||
|
||||
# Args:
|
||||
# text (LongTensor): Batch of padded token ids (B, Tmax).
|
||||
# text_lengths (LongTensor): Batch of lengths of each input (B,).
|
||||
# speech (Tensor): Batch of padded target features (B, Lmax, odim).
|
||||
# speech_lengths (LongTensor): Batch of the lengths of each target (B,).
|
||||
# durations (LongTensor): Batch of padded durations (B, Tmax + 1).
|
||||
# durations_lengths (LongTensor): Batch of duration lengths (B, Tmax + 1).
|
||||
# pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
||||
# pitch_lengths (LongTensor): Batch of pitch lengths (B, Tmax + 1).
|
||||
# energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
||||
# energy_lengths (LongTensor): Batch of energy lengths (B, Tmax + 1).
|
||||
# Returns:
|
||||
# Tensor: Loss scalar value.
|
||||
# Dict: Statistics to be monitored.
|
||||
# Tensor: Weight value.
|
||||
|
||||
# """
|
||||
Parameters
|
||||
----------
|
||||
text : LongTensor
|
||||
Batch of padded token ids (B, Tmax).
|
||||
text_lengths : LongTensor
|
||||
Batch of lengths of each input (B,).
|
||||
speech : Tensor
|
||||
Batch of padded target features (B, Lmax, odim).
|
||||
speech_lengths : LongTensor
|
||||
Batch of the lengths of each target (B,).
|
||||
durations : LongTensor
|
||||
Batch of padded durations (B, Tmax + 1).
|
||||
durations_lengths : LongTensor
|
||||
Batch of duration lengths (B, Tmax + 1).
|
||||
pitch : Tensor
|
||||
Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
||||
pitch_lengths : LongTensor
|
||||
Batch of pitch lengths (B, Tmax + 1).
|
||||
energy : Tensor
|
||||
Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
||||
energy_lengths : LongTensor
|
||||
Batch of energy lengths (B, Tmax + 1).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Loss scalar value.
|
||||
Dict
|
||||
Statistics to be monitored.
Tensor
Weight value.
"""
|
||||
text = text[:, :text_lengths.max()] # for data-parallel
|
||||
speech = speech[:, :speech_lengths.max()] # for data-parallel
|
||||
durations = durations[:, :durations_lengths.max()] # for data-parallel
|
||||
|
@ -282,16 +297,11 @@ class FastSpeech2(nn.Layer):
|
|||
|
||||
# Add eos at the last of sequence
|
||||
# xs = F.pad(text, [0, 1], "constant", self.padding_idx)
|
||||
print("xs.shape in fastspeech2.py before:", text.shape, text)
|
||||
xs = np.pad(text.numpy(),
|
||||
pad_width=((0, 0), (0, 1)),
|
||||
mode="constant",
|
||||
constant_values=self.padding_idx)
|
||||
xs = paddle.to_tensor(xs)
|
||||
print("xs.shape in fastspeech2.py end:", xs.shape, xs)
|
||||
# my_pad = nn.Pad1D(padding=[0, 1], mode="constant", value=self.padding_idx)
|
||||
# xs = my_pad(text)
|
||||
# Could this index go out of bounds? Can xs[i, l] be written? -> Yes, because the previous step appended a padding_idx slot, which now becomes the eos.
|
||||
for i, l in enumerate(text_lengths):
|
||||
xs[i, l] = self.eos
|
||||
ilens = text_lengths + 1
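To make the shape bookkeeping above concrete, here is a small numpy-only sketch (values are made up) of what the padding-plus-eos step produces, assuming padding_idx = 0 and eos = 2:

import numpy as np

padding_idx, eos = 0, 2                       # hypothetical values
text = np.array([[5, 6, 7],                   # length 3
                 [8, 9, 0]])                  # length 2, already right-padded
text_lengths = np.array([3, 2])
xs = np.pad(text, pad_width=((0, 0), (0, 1)), mode="constant",
            constant_values=padding_idx)      # append one padding column
for i, l in enumerate(text_lengths):
    xs[i, l] = eos                            # the appended slot becomes eos
# xs -> [[5, 6, 7, 2], [8, 9, 2, 0]]; ilens -> [4, 3]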
|
||||
|
@ -302,23 +312,16 @@ class FastSpeech2(nn.Layer):
|
|||
# forward propagation
|
||||
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
|
||||
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
|
||||
print("d_outs in paddle:", d_outs)
|
||||
print("p_outs in paddle:", p_outs)
|
||||
print("e_outs in paddle:", e_outs)
|
||||
|
||||
# modify mod part of groundtruth
|
||||
if self.reduction_factor > 1:
|
||||
# TODO: this needs to be revised
|
||||
olens = paddle.to_tensor([
|
||||
olen - olen % self.reduction_factor for olen in olens.numpy()
|
||||
])
|
||||
max_olen = max(olens)
|
||||
ys = ys[:, :max_olen]
|
||||
|
||||
# calculate loss
|
||||
if self.postnet is None:
|
||||
after_outs = None
|
||||
|
||||
# calculate loss
|
||||
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
|
||||
after_outs=after_outs,
|
||||
|
@ -363,9 +366,8 @@ class FastSpeech2(nn.Layer):
|
|||
alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
|
||||
# forward encoder
|
||||
x_masks = self._source_mask(ilens)
|
||||
print("xs.shape in fastspeech2.py:", xs.shape)
|
||||
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
||||
|
||||
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
||||
# forward duration predictor and variance predictors
|
||||
d_masks = make_pad_mask(ilens)
|
||||
|
||||
|
@ -377,10 +379,11 @@ class FastSpeech2(nn.Layer):
|
|||
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
|
||||
else:
|
||||
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
|
||||
print("p_outs.shape:", p_outs.shape)
|
||||
|
||||
if is_inference:
|
||||
d_outs = self.duration_predictor.inference(hs,
|
||||
d_masks) # (B, Tmax)
|
||||
# print("d_outs:",d_outs)
|
||||
# use prediction in inference
|
||||
# (B, Tmax, 1)
|
||||
|
||||
|
@ -404,7 +407,6 @@ class FastSpeech2(nn.Layer):
|
|||
# forward decoder
|
||||
if olens is not None and not is_inference:
|
||||
if self.reduction_factor > 1:
|
||||
# converting directly to a paddle tensor would add an extra dimension, so convert to numpy first
|
||||
olens_in = paddle.to_tensor(
|
||||
[olen // self.reduction_factor for olen in olens.numpy()])
|
||||
else:
|
||||
|
@ -412,9 +414,10 @@ class FastSpeech2(nn.Layer):
|
|||
h_masks = self._source_mask(olens_in)
|
||||
else:
|
||||
h_masks = None
|
||||
zs, _ = self.decoder(hs, h_masks) # (B, Lmax, adim)
|
||||
before_outs = self.feat_out(zs).reshape(
|
||||
(zs.shape[0], -1, self.odim)) # (B, Lmax, odim)
|
||||
# (B, Lmax, adim)
|
||||
zs, _ = self.decoder(hs, h_masks)
|
||||
# (B, Lmax, odim)
|
||||
before_outs = self.feat_out(zs).reshape((zs.shape[0], -1, self.odim))
|
||||
|
||||
# postnet -> (B, Lmax//r * r, odim)
|
||||
if self.postnet is None:
|
||||
|
@ -437,20 +440,30 @@ class FastSpeech2(nn.Layer):
|
|||
paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
||||
"""Generate the sequence of features given the sequences of characters.
|
||||
|
||||
Args:
|
||||
text (LongTensor): Input sequence of characters (T,).
|
||||
speech (Tensor, optional): Feature sequence to extract style (N, idim).
|
||||
durations (LongTensor, optional): Groundtruth of duration (T + 1,).
|
||||
pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1).
|
||||
energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1).
|
||||
alpha (float, optional): Alpha to control the speed.
|
||||
use_teacher_forcing (bool, optional): Whether to use teacher forcing.
|
||||
If true, groundtruth of duration, pitch and energy will be used.
|
||||
Parameters
|
||||
----------
|
||||
text : LongTensor
|
||||
Input sequence of characters (T,).
|
||||
speech : Tensor, optional
|
||||
Feature sequence to extract style (N, idim).
|
||||
durations : LongTensor, optional
|
||||
Groundtruth of duration (T + 1,).
|
||||
pitch : Tensor, optional
|
||||
Groundtruth of token-averaged pitch (T + 1, 1).
|
||||
energy : Tensor, optional
|
||||
Groundtruth of token-averaged energy (T + 1, 1).
|
||||
alpha : float, optional
|
||||
Alpha to control the speed.
|
||||
use_teacher_forcing : bool, optional
|
||||
Whether to use teacher forcing.
|
||||
If true, groundtruth of duration, pitch and energy will be used.
|
||||
|
||||
Returns:
|
||||
Tensor: Output sequence of features (L, odim).
|
||||
None: Dummy for compatibility.
|
||||
None: Dummy for compatibility.
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Output sequence of features (L, odim).
|
||||
None
|
||||
Dummy for compatibility.
|
||||
|
||||
"""
|
||||
x, y = text, speech
|
||||
|
@ -460,13 +473,15 @@ class FastSpeech2(nn.Layer):
|
|||
x = np.pad(text.numpy(),
|
||||
pad_width=((0, 1)),
|
||||
mode="constant",
|
||||
constant_values=self.padding_idx)
|
||||
constant_values=self.eos)
|
||||
|
||||
x = paddle.to_tensor(x)
|
||||
|
||||
# setup batch axis
|
||||
ilens = paddle.to_tensor(
|
||||
[x.shape[0]], dtype=paddle.int64, place=x.place)
|
||||
xs, ys = x.unsqueeze(0), None
|
||||
|
||||
if y is not None:
|
||||
ys = y.unsqueeze(0)
|
||||
|
||||
|
@ -493,14 +508,19 @@ class FastSpeech2(nn.Layer):
|
|||
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
|
||||
"""Make masks for self-attention.
|
||||
|
||||
Args:
|
||||
ilens (LongTensor): Batch of lengths (B,).
|
||||
Parameters
|
||||
----------
|
||||
ilens : LongTensor
|
||||
Batch of lengths (B,).
|
||||
|
||||
Returns:
|
||||
Tensor: Mask tensor for self-attention.
|
||||
Returns
|
||||
-------
|
||||
Tensor
|
||||
Mask tensor for self-attention.
|
||||
dtype=paddle.bool
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
-------
|
||||
>>> ilens = [5, 3]
|
||||
>>> self._source_mask(ilens)
|
||||
tensor([[[1, 1, 1, 1, 1],
|
||||
|
@ -510,6 +530,29 @@ class FastSpeech2(nn.Layer):
|
|||
x_masks = make_non_pad_mask(ilens)
|
||||
return x_masks.unsqueeze(-2)
|
||||
|
||||
def _reset_parameters(self,
|
||||
init_type: str,
|
||||
init_enc_alpha: float,
|
||||
init_dec_alpha: float):
|
||||
# initialize parameters
|
||||
initialize(self, init_type)
|
||||
|
||||
# initialize alpha in scaled positional encoding
|
||||
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
|
||||
self.encoder.embed[-1].alpha = paddle.create_parameter(
|
||||
shape=init_enc_alpha.shape,
|
||||
dtype=str(init_enc_alpha.numpy().dtype),
|
||||
default_initializer=paddle.nn.initializer.Assign(
|
||||
init_enc_alpha))
|
||||
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
|
||||
self.decoder.embed[-1].alpha = paddle.create_parameter(
|
||||
shape=init_dec_alpha.shape,
|
||||
dtype=str(init_dec_alpha.numpy().dtype),
|
||||
default_initializer=paddle.nn.initializer.Assign(
|
||||
init_dec_alpha))
|
||||
|
||||
|
||||
class FastSpeech2Loss(nn.Layer):
|
||||
"""Loss function module for FastSpeech2."""
|
||||
|
@ -519,12 +562,12 @@ class FastSpeech2Loss(nn.Layer):
|
|||
use_weighted_masking: bool=False):
|
||||
"""Initialize feed-forward Transformer loss module.
|
||||
|
||||
Args:
|
||||
use_masking (bool):
|
||||
Parameters
|
||||
----------
|
||||
use_masking : bool
|
||||
Whether to apply masking for padded part in loss calculation.
|
||||
use_weighted_masking (bool):
|
||||
use_weighted_masking : bool
|
||||
Whether to apply weighted masking in loss calculation.
|
||||
|
||||
"""
|
||||
assert check_argument_types()
|
||||
super().__init__()
|
||||
|
@ -555,24 +598,41 @@ class FastSpeech2Loss(nn.Layer):
|
|||
paddle.Tensor, paddle.Tensor]:
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
|
||||
before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
|
||||
d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax).
|
||||
p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||
e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
|
||||
ys (Tensor): Batch of target features (B, Lmax, odim).
|
||||
ds (LongTensor): Batch of durations (B, Tmax).
|
||||
ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
|
||||
es (Tensor): Batch of target token-averaged energy (B, Tmax, 1).
|
||||
ilens (LongTensor): Batch of the lengths of each input (B,).
|
||||
olens (LongTensor): Batch of the lengths of each target (B,).
|
||||
Parameters
|
||||
----------
|
||||
after_outs : Tensor
|
||||
Batch of outputs after postnets (B, Lmax, odim).
|
||||
before_outs : Tensor
|
||||
Batch of outputs before postnets (B, Lmax, odim).
|
||||
d_outs : LongTensor
|
||||
Batch of outputs of duration predictor (B, Tmax).
|
||||
p_outs : Tensor
|
||||
Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||
e_outs : Tensor
|
||||
Batch of outputs of energy predictor (B, Tmax, 1).
|
||||
ys : Tensor
|
||||
Batch of target features (B, Lmax, odim).
|
||||
ds : LongTensor
|
||||
Batch of durations (B, Tmax).
|
||||
ps : Tensor
|
||||
Batch of target token-averaged pitch (B, Tmax, 1).
|
||||
es : Tensor
|
||||
Batch of target token-averaged energy (B, Tmax, 1).
|
||||
ilens : LongTensor
|
||||
Batch of the lengths of each input (B,).
|
||||
olens : LongTensor
|
||||
Batch of the lengths of each target (B,).
|
||||
|
||||
Returns:
|
||||
Tensor: L1 loss value.
|
||||
Tensor: Duration predictor loss value.
|
||||
Tensor: Pitch predictor loss value.
|
||||
Tensor: Energy predictor loss value.
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
L1 loss value.
|
||||
Tensor
|
||||
Duration predictor loss value.
|
||||
Tensor
|
||||
Pitch predictor loss value.
|
||||
Tensor
|
||||
Energy predictor loss value.
|
||||
|
||||
"""
|
||||
# apply mask to remove padded part
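The comment above refers to the standard masked-loss pattern. A minimal sketch of that pattern, assuming use_masking=True and reusing make_non_pad_mask from nets_utils (not this file's exact code):

# illustrative only: drop padded frames before computing the L1 loss
out_masks = make_non_pad_mask(olens).unsqueeze(-1)            # (B, Lmax, 1), bool
l1 = paddle.nn.L1Loss()
valid_outs = before_outs.masked_select(out_masks.broadcast_to(before_outs.shape))
valid_ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
l1_loss = l1(valid_outs, valid_ys)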
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from parakeet.modules.layer_norm import LayerNorm
|
||||
from parakeet.modules.masked_fill import masked_fill
|
||||
|
||||
|
@ -31,7 +30,8 @@ class DurationPredictor(nn.Layer):
|
|||
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
|
||||
https://arxiv.org/pdf/1905.09263.pdf
|
||||
|
||||
Note:
|
||||
Note
|
||||
----------
|
||||
The calculation domain of outputs is different
|
||||
between in `forward` and in `inference`. In `forward`,
|
||||
the outputs are calculated in log domain but in `inference`,
|
||||
|
@ -48,13 +48,20 @@ class DurationPredictor(nn.Layer):
|
|||
offset=1.0):
|
||||
"""Initilize duration predictor module.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
n_layers (int, optional): Number of convolutional layers.
|
||||
n_chans (int, optional): Number of channels of convolutional layers.
|
||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
||||
dropout_rate (float, optional): Dropout rate.
|
||||
offset (float, optional): Offset value to avoid nan in log domain.
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
|
||||
"""
|
||||
super(DurationPredictor, self).__init__()
|
||||
|
@ -74,7 +81,7 @@ class DurationPredictor(nn.Layer):
|
|||
LayerNorm(
|
||||
n_chans, dim=1),
|
||||
nn.Dropout(dropout_rate), ))
|
||||
self.linear = nn.Linear(n_chans, 1)
|
||||
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||
|
||||
def _forward(self, xs, x_masks=None, is_inference=False):
|
||||
# (B, idim, Tmax)
|
||||
|
@ -83,7 +90,7 @@ class DurationPredictor(nn.Layer):
|
|||
for f in self.conv:
|
||||
xs = f(xs)
|
||||
|
||||
# NOTE: calculate in log domain
|
||||
# NOTE: calculate in log domain
|
||||
# (B, Tmax)
|
||||
xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1)
|
||||
|
||||
|
@ -99,28 +106,34 @@ class DurationPredictor(nn.Layer):
|
|||
def forward(self, xs, x_masks=None):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
||||
x_masks (ByteTensor, optional):
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : ByteTensor, optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
|
||||
Returns:
|
||||
Tensor: Batch of predicted durations in log domain (B, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of predicted durations in log domain (B, Tmax).
|
||||
"""
|
||||
return self._forward(xs, x_masks, False)
|
||||
|
||||
def inference(self, xs, x_masks=None):
|
||||
"""Inference duration.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
||||
x_masks (ByteTensor, optional):
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : Tensor(bool), optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
|
||||
Returns:
|
||||
LongTensor: Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
LongTensor
|
||||
Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||
"""
|
||||
return self._forward(xs, x_masks, True)
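A hedged usage sketch of the two entry points above; the idim value, the input shapes, and the reliance on default constructor arguments are assumptions for illustration:

import paddle
predictor = DurationPredictor(idim=384)          # other arguments assumed to default
xs = paddle.randn([2, 10, 384])                  # (B, Tmax, idim)
log_d = predictor(xs)                            # forward: (B, Tmax), log domain
lin_d = predictor.inference(xs)                  # inference: (B, Tmax), int64, linear domain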
|
||||
|
||||
|
@ -135,10 +148,12 @@ class DurationPredictorLoss(nn.Layer):
|
|||
def __init__(self, offset=1.0, reduction="mean"):
|
||||
"""Initilize duration predictor loss module.
|
||||
|
||||
Args:
|
||||
offset (float, optional): Offset value to avoid nan in log domain.
|
||||
reduction (str): Reduction type in loss calculation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
reduction : str
|
||||
Reduction type in loss calculation.
|
||||
"""
|
||||
super(DurationPredictorLoss, self).__init__()
|
||||
self.criterion = nn.MSELoss(reduction=reduction)
|
||||
|
@ -147,16 +162,21 @@ class DurationPredictorLoss(nn.Layer):
|
|||
def forward(self, outputs, targets):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
outputs (Tensor): Batch of prediction durations in log domain (B, T)
|
||||
targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)
|
||||
Parameters
|
||||
----------
|
||||
outputs : Tensor
|
||||
Batch of predicted durations in log domain (B, T).
|
||||
targets : LongTensor
|
||||
Batch of groundtruth durations in linear domain (B, T)
|
||||
|
||||
Returns:
|
||||
Tensor: Mean squared error loss value.
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Mean squared error loss value.
|
||||
|
||||
Note:
|
||||
Note
|
||||
----------
|
||||
`outputs` is in log domain but `targets` is in linear domain.
|
||||
|
||||
"""
|
||||
# NOTE: outputs is in log domain while targets in linear
|
||||
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
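A small worked example of the conversion above, assuming the default offset of 1.0:

import paddle
targets = paddle.to_tensor([0, 1, 4], dtype='int64')       # durations in frames
log_targets = paddle.log(targets.cast('float32') + 1.0)    # -> [0.0, 0.693, 1.609]
# the MSE criterion then compares these against predictions already in log domain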
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
# limitations under the License.
|
||||
"""Length regulator related modules."""
|
||||
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
@ -37,8 +35,10 @@ class LengthRegulator(nn.Layer):
|
|||
def __init__(self, pad_value=0.0):
|
||||
"""Initilize length regulator module.
|
||||
|
||||
Args:
|
||||
pad_value (float, optional): Value used for padding.
|
||||
Parameters
|
||||
----------
|
||||
pad_value : float, optional
|
||||
Value used for padding.
|
||||
|
||||
"""
|
||||
super().__init__()
|
||||
|
@ -68,14 +68,19 @@ class LengthRegulator(nn.Layer):
|
|||
def forward(self, xs, ds, alpha=1.0):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||
ds (LongTensor): Batch of durations of each frame (B, T).
|
||||
alpha (float, optional): Alpha value to control speed of speech.
|
||||
|
||||
Returns:
|
||||
Tensor: replicated input tensor based on durations (B, T*, D).
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||
ds : LongTensor
|
||||
Batch of durations of each frame (B, T).
|
||||
alpha : float, optional
|
||||
Alpha value to control speed of speech.
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Replicated input tensor based on durations (B, T*, D).
|
||||
"""
|
||||
if alpha != 1.0:
|
||||
assert alpha > 0
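Duration-based expansion is easiest to see on a single unbatched sequence; a numpy sketch (illustrative only, the module additionally handles batching, padding and alpha):

import numpy as np
xs = np.array([[1., 1.], [2., 2.], [3., 3.]])   # (T=3, D=2) token embeddings
ds = np.array([2, 0, 3])                        # durations per token
expanded = np.repeat(xs, ds, axis=0)            # (sum(ds)=5, D=2)
# expanded -> [[1,1],[1,1],[3,3],[3,3],[3,3]]: each token repeated ds[i] times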
|
||||
|
|
|
@ -43,15 +43,22 @@ class Postnet(nn.Layer):
|
|||
use_batch_norm=True, ):
|
||||
"""Initialize postnet module.
|
||||
|
||||
Args:
|
||||
idim (int): Dimension of the inputs.
|
||||
odim (int): Dimension of the outputs.
|
||||
n_layers (int, optional): The number of layers.
|
||||
n_filts (int, optional): The number of filter size.
|
||||
n_units (int, optional): The number of filter channels.
|
||||
use_batch_norm (bool, optional): Whether to use batch normalization..
|
||||
dropout_rate (float, optional): Dropout rate..
|
||||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Dimension of the inputs.
|
||||
odim : int
|
||||
Dimension of the outputs.
|
||||
n_layers : int, optional
|
||||
The number of layers.
|
||||
n_filts : int, optional
|
||||
The number of filter size.
|
||||
n_units : int, optional
|
||||
The number of filter channels.
|
||||
use_batch_norm : bool, optional
|
||||
Whether to use batch normalization.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
"""
|
||||
super(Postnet, self).__init__()
|
||||
self.postnet = nn.LayerList()
|
||||
|
@ -111,11 +118,15 @@ class Postnet(nn.Layer):
|
|||
def forward(self, xs):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||
|
||||
Returns:
|
||||
Tensor: Batch of padded output tensor. (B, odim, Tmax).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of padded output tensor (B, odim, Tmax).
|
||||
|
||||
"""
|
||||
for i in six.moves.range(len(self.postnet)):
|
||||
|
|
|
@ -15,10 +15,8 @@
|
|||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from parakeet.modules.layer_norm import LayerNorm
|
||||
from parakeet.modules.masked_fill import masked_fill
|
||||
|
||||
from typeguard import check_argument_types
|
||||
|
||||
|
||||
|
@ -43,13 +41,18 @@ class VariancePredictor(nn.Layer):
|
|||
dropout_rate: float=0.5, ):
|
||||
"""Initilize duration predictor module.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
n_layers (int, optional): Number of convolutional layers.
|
||||
n_chans (int, optional): Number of channels of convolutional layers.
|
||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
||||
dropout_rate (float, optional): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
super().__init__()
|
||||
|
@ -70,26 +73,30 @@ class VariancePredictor(nn.Layer):
|
|||
n_chans, dim=1),
|
||||
nn.Dropout(dropout_rate), ))
|
||||
|
||||
self.linear = nn.Linear(n_chans, 1)
|
||||
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||
|
||||
def forward(self, xs: paddle.Tensor,
|
||||
x_masks: paddle.Tensor=None) -> paddle.Tensor:
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
||||
x_masks (ByteTensor, optional):
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : Tensor(bool), optional
|
||||
Batch of masks indicating padded part (B, Tmax, 1).
|
||||
|
||||
Returns:
|
||||
Tensor: Batch of predicted sequences (B, Tmax, 1).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of predicted sequences (B, Tmax, 1).
|
||||
"""
|
||||
# (B, idim, Tmax)
|
||||
xs = xs.transpose([0, 2, 1])
|
||||
# (B, C, Tmax)
|
||||
for f in self.conv:
|
||||
xs = f(xs) # (B, C, Tmax)
|
||||
# (B, C, Tmax)
|
||||
xs = f(xs)
|
||||
# (B, Tmax, 1)
|
||||
xs = self.linear(xs.transpose([0, 2, 1]))
|
||||
|
||||
|
|
|
@ -16,23 +16,22 @@
|
|||
import math
|
||||
|
||||
import numpy
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from paddle.fluid.layers import sequence_mask
|
||||
|
||||
from parakeet.modules.masked_fill import masked_fill
|
||||
|
||||
|
||||
class MultiHeadedAttention(nn.Layer):
|
||||
"""Multi-Head Attention layer.
|
||||
|
||||
Args:
|
||||
n_head (int): The number of heads.
|
||||
n_feat (int): The number of features.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_head : int
|
||||
The number of heads.
|
||||
n_feat : int
|
||||
The number of features.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
|
||||
def __init__(self, n_head, n_feat, dropout_rate):
|
||||
|
@ -42,33 +41,42 @@ class MultiHeadedAttention(nn.Layer):
|
|||
# We assume d_v always equals d_k
|
||||
self.d_k = n_feat // n_head
|
||||
self.h = n_head
|
||||
self.linear_q = nn.Linear(n_feat, n_feat)
|
||||
self.linear_k = nn.Linear(n_feat, n_feat)
|
||||
self.linear_v = nn.Linear(n_feat, n_feat)
|
||||
self.linear_out = nn.Linear(n_feat, n_feat)
|
||||
self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.attn = None
|
||||
self.dropout = nn.Dropout(p=dropout_rate)
|
||||
|
||||
def forward_qkv(self, query, key, value):
|
||||
"""Transform query, key and value.
|
||||
|
||||
Args:
|
||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
|
||||
paddle.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
|
||||
Parameters
|
||||
----------
|
||||
query : paddle.Tensor
|
||||
Query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Transformed query tensor (#batch, n_head, time1, d_k).
|
||||
paddle.Tensor
|
||||
Transformed key tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor
|
||||
Transformed value tensor (#batch, n_head, time2, d_k).
|
||||
"""
|
||||
n_batch = query.shape[0]
|
||||
|
||||
q = paddle.reshape(
|
||||
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
|
||||
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
|
||||
v = paddle.reshape(
|
||||
self.linear_v(value), [n_batch, -1, self.h, self.d_k])
|
||||
|
||||
# (batch, head, time1, d_k)
|
||||
q = q.transpose((0, 2, 1, 3))
|
||||
# (batch, head, time2, d_k)
|
||||
|
@ -80,44 +88,40 @@ class MultiHeadedAttention(nn.Layer):
|
|||
def forward_attention(self, value, scores, mask=None):
|
||||
"""Compute attention context vector.
|
||||
|
||||
Args:
|
||||
value (paddle.Tensor): Transformed value (#batch, n_head, time2, d_k).
|
||||
scores (paddle.Tensor): Attention score (#batch, n_head, time1, time2).
|
||||
mask (paddle.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||
Parameters
|
||||
----------
|
||||
value : paddle.Tensor
|
||||
Transformed value (#batch, n_head, time2, d_k).
|
||||
scores : paddle.Tensor
|
||||
Attention score (#batch, n_head, time1, time2).
|
||||
mask : paddle.Tensor
|
||||
Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Transformed value (#batch, time1, d_model)
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Transformed value (#batch, time1, d_model)
|
||||
weighted by the attention score (#batch, time1, time2).
|
||||
|
||||
"""
|
||||
n_batch = value.shape[0]
|
||||
softmax = paddle.nn.Softmax(axis=-1)
|
||||
if mask is not None:
|
||||
|
||||
mask = mask.unsqueeze(1)
|
||||
# invert the mask: padded positions become True and are later filled with 0
|
||||
mask = paddle.logical_not(mask)
|
||||
|
||||
# mask = paddle.cast(mask, dtype='int64')
|
||||
# positions where mask == 1 are replaced with min_value
|
||||
# scores = scores.masked_fill(mask, min_value)
|
||||
min_value = float(
|
||||
numpy.finfo(
|
||||
paddle.to_tensor(
|
||||
0, dtype=scores.dtype).numpy().dtype).min)
|
||||
|
||||
scores = masked_fill(scores, mask, min_value)
|
||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
||||
|
||||
# fill the elements of the tensor where the mask is 1 with the given value, i.e. keep only positions where the mask is 0
|
||||
# self.attn = torch.softmax(scores, dim=-1).masked_fill(
|
||||
# mask, 0.0
|
||||
# ) # (batch, head, time1, time2)
|
||||
# keep positions where the mask is 0; everything else becomes 0
|
||||
# (batch, head, time1, time2)
|
||||
self.attn = softmax(scores)
|
||||
self.attn = masked_fill(self.attn, mask, 0.0)
|
||||
else:
|
||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
||||
# (batch, head, time1, time2)
|
||||
# (batch, head, time1, time2)
|
||||
self.attn = softmax(scores)
|
||||
# (batch, head, time1, time2)
|
||||
p_attn = self.dropout(self.attn)
|
||||
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
|
||||
x = paddle.matmul(p_attn, value)
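The masking logic above (fill padded scores with a very large negative number, softmax, then zero the padded weights) can be sketched with plain numpy; shapes and values are illustrative:

import numpy as np
scores = np.array([[0.5, 1.0, 0.2]])            # (1, time2) raw attention scores
non_pad = np.array([[True, True, False]])       # True = real token
pad_mask = ~non_pad                             # True = padded position
scores = np.where(pad_mask, np.finfo(scores.dtype).min, scores)
attn = np.exp(scores - scores.max(-1, keepdims=True))
attn /= attn.sum(-1, keepdims=True)             # softmax: padded weight is ~0
attn = np.where(pad_mask, 0.0, attn)            # explicit zero, like masked_fill above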
|
||||
|
@ -130,16 +134,21 @@ class MultiHeadedAttention(nn.Layer):
|
|||
def forward(self, query, key, value, mask=None):
|
||||
"""Compute scaled dot product attention.
|
||||
|
||||
Args:
|
||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
||||
mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
|
||||
(#batch, time1, time2).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor (#batch, time1, d_model).
|
||||
Parameters
|
||||
----------
|
||||
query : paddle.Tensor
|
||||
Query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time1, d_model).
|
||||
"""
|
||||
q, k, v = self.forward_qkv(query, key, value)
|
||||
scores = paddle.matmul(q, k.transpose(
|
||||
|
|
|
@ -22,14 +22,16 @@ from paddle import nn
|
|||
class PositionalEncoding(nn.Layer):
|
||||
"""Positional encoding.
|
||||
|
||||
Args:
|
||||
d_model (int): Embedding dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
max_len (int): Maximum input length.
|
||||
reverse (bool): Whether to reverse the input position. Only for
|
||||
the class LegacyRelPositionalEncoding. We remove it in the current
|
||||
class RelPositionalEncoding.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d_model : int
|
||||
Embedding dimension.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
max_len : int
|
||||
Maximum input length.
|
||||
reverse : bool
|
||||
Whether to reverse the input position. Only for the class LegacyRelPositionalEncoding.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
||||
|
@ -47,7 +49,6 @@ class PositionalEncoding(nn.Layer):
|
|||
|
||||
pe = paddle.zeros([x.shape[1], self.d_model])
|
||||
if self.reverse:
|
||||
# (x.shape[1],1)
|
||||
position = paddle.arange(
|
||||
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
|
||||
else:
|
||||
|
@ -65,12 +66,15 @@ class PositionalEncoding(nn.Layer):
|
|||
def forward(self, x: paddle.Tensor):
|
||||
"""Add positional encoding.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor (batch, time, `*`).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Encoded tensor (batch, time, `*`).
|
||||
"""
|
||||
self.extend_pe(x)
|
||||
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
||||
|
@ -82,11 +86,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
|||
|
||||
See Sec. 3.2 https://arxiv.org/abs/1809.08895
|
||||
|
||||
Args:
|
||||
d_model (int): Embedding dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
max_len (int): Maximum input length.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d_model : int
|
||||
Embedding dimension.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
max_len : int
|
||||
Maximum input length.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, dropout_rate, max_len=5000):
|
||||
|
@ -106,12 +113,15 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
|||
def forward(self, x):
|
||||
"""Add positional encoding.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor (batch, time, `*`).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Encoded tensor (batch, time, `*`).
|
||||
"""
|
||||
self.extend_pe(x)
|
||||
x = x + self.alpha * self.pe[:, :x.shape[1]]
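For reference, a hedged numpy sketch of the sinusoidal table both classes index into, and how each class combines it with the input (xscale is commonly sqrt(d_model); alpha is the learnable scalar of ScaledPositionalEncoding):

import numpy as np
d_model, max_len = 8, 16
position = np.arange(max_len, dtype="float32")[:, None]
div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
pe = np.zeros([max_len, d_model], dtype="float32")
pe[:, 0::2] = np.sin(position * div_term)
pe[:, 1::2] = np.cos(position * div_term)
# PositionalEncoding:       x * xscale + pe[:T]
# ScaledPositionalEncoding: x + alpha * pe[:T]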
|
||||
|
|
|
@ -12,19 +12,11 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
|
||||
import numpy
|
||||
import logging
|
||||
import paddle
|
||||
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
from paddle.nn import initializer as I
|
||||
from paddle.fluid.layers import sequence_mask
|
||||
import sys
|
||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
|
||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
|
||||
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
|
||||
|
@ -35,28 +27,44 @@ from parakeet.modules.fastspeech2_transformer.repeat import repeat
|
|||
class Encoder(nn.Layer):
|
||||
"""Transformer encoder module.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
attention_dim (int): Dimention of attention.
|
||||
attention_heads (int): The number of heads of multi head attention.
|
||||
linear_units (int): The number of units of position-wise feed forward.
|
||||
num_blocks (int): The number of decoder blocks.
|
||||
dropout_rate (float): Dropout rate.
|
||||
positional_dropout_rate (float): Dropout rate after adding positional encoding.
|
||||
attention_dropout_rate (float): Dropout rate in attention.
|
||||
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
|
||||
pos_enc_class (paddle.nn.Layer): Positional encoding module class.
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
attention_dim : int
|
||||
Dimension of attention.
|
||||
attention_heads : int
|
||||
The number of heads of multi head attention.
|
||||
linear_units : int
|
||||
The number of units of position-wise feed forward.
|
||||
num_blocks : int
|
||||
The number of decoder blocks.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
positional_dropout_rate : float
|
||||
Dropout rate after adding positional encoding.
|
||||
attention_dropout_rate : float
|
||||
Dropout rate in attention.
|
||||
input_layer : Union[str, paddle.nn.Layer]
|
||||
Input layer type.
|
||||
pos_enc_class : paddle.nn.Layer
|
||||
Positional encoding module class.
|
||||
`PositionalEncoding` or `ScaledPositionalEncoding`
|
||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
||||
concat_after (bool): Whether to concat attention layer's input and output.
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
|
||||
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
|
||||
selfattention_layer_type (str): Encoder attention layer type.
|
||||
padding_idx (int): Padding idx for input_layer=embed.
|
||||
|
||||
positionwise_layer_type : str
|
||||
"linear", "conv1d", or "conv1d-linear".
|
||||
positionwise_conv_kernel_size : int
|
||||
Kernel size of positionwise conv1d layer.
|
||||
selfattention_layer_type : str
|
||||
Encoder attention layer type.
|
||||
padding_idx : int
|
||||
Padding idx for input_layer=embed.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -82,7 +90,8 @@ class Encoder(nn.Layer):
|
|||
self.conv_subsampling_factor = 1
|
||||
if input_layer == "linear":
|
||||
self.embed = nn.Sequential(
|
||||
nn.Linear(idim, attention_dim),
|
||||
nn.Linear(
|
||||
idim, attention_dim, bias_attr=True),
|
||||
nn.LayerNorm(attention_dim),
|
||||
nn.Dropout(dropout_rate),
|
||||
nn.ReLU(),
|
||||
|
@ -169,14 +178,19 @@ class Encoder(nn.Layer):
|
|||
def forward(self, xs, masks):
|
||||
"""Encode input sequence.
|
||||
|
||||
Args:
|
||||
xs (paddle.Tensor): Input tensor (#batch, time, idim).
|
||||
masks (paddle.Tensor): Mask tensor (#batch, time).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor (#batch, time, attention_dim).
|
||||
paddle.Tensor: Mask tensor (#batch, time).
|
||||
Parameters
|
||||
----------
|
||||
xs : paddle.Tensor
|
||||
Input tensor (#batch, time, idim).
|
||||
masks : paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, attention_dim).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
"""
|
||||
xs = self.embed(xs)
|
||||
xs, masks = self.encoders(xs, masks)
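A hedged note on the mask convention this forward assumes: the per-utterance lengths are turned into a boolean (#batch, 1, time) mask, mirroring _source_mask in fastspeech2.py (values are illustrative):

import paddle
ilens = paddle.to_tensor([5, 3])
masks = make_non_pad_mask(ilens).unsqueeze(-2)   # (B, 1, Tmax), True on real tokens
# hs, hs_masks = encoder(xs, masks)              # xs: (B, Tmax, idim)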
|
||||
|
@ -187,16 +201,23 @@ class Encoder(nn.Layer):
|
|||
def forward_one_step(self, xs, masks, cache=None):
|
||||
"""Encode input frame.
|
||||
|
||||
Args:
|
||||
xs (paddle.Tensor): Input tensor.
|
||||
masks (paddle.Tensor): Mask tensor.
|
||||
cache (List[paddle.Tensor]): List of cache tensors.
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor.
|
||||
paddle.Tensor: Mask tensor.
|
||||
List[paddle.Tensor]: List of new cache tensors.
|
||||
Parameters
|
||||
----------
|
||||
xs : paddle.Tensor
|
||||
Input tensor.
|
||||
masks : paddle.Tensor
|
||||
Mask tensor.
|
||||
cache : List[paddle.Tensor]
|
||||
List of cache tensors.
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor.
|
||||
paddle.Tensor
|
||||
Mask tensor.
|
||||
List[paddle.Tensor]
|
||||
List of new cache tensors.
|
||||
"""
|
||||
|
||||
xs = self.embed(xs)
|
||||
|
|
|
@ -14,28 +14,31 @@
|
|||
"""Encoder self-attention layer definition."""
|
||||
|
||||
import paddle
|
||||
|
||||
from paddle import nn
|
||||
|
||||
|
||||
class EncoderLayer(nn.Layer):
|
||||
"""Encoder layer module.
|
||||
|
||||
Args:
|
||||
size (int): Input dimension.
|
||||
self_attn (paddle.nn.Layer): Self-attention module instance.
|
||||
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
|
||||
can be used as the argument.
|
||||
feed_forward (paddle.nn.Layer): Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
||||
can be used as the argument.
|
||||
dropout_rate (float): Dropout rate.
|
||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
||||
concat_after (bool): Whether to concat attention layer's input and output.
|
||||
Parameters
|
||||
----------
|
||||
size : int
|
||||
Input dimension.
|
||||
self_attn : paddle.nn.Layer
|
||||
Self-attention module instance.
|
||||
`MultiHeadedAttention` instance can be used as the argument.
|
||||
feed_forward : paddle.nn.Layer
|
||||
Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -57,20 +60,26 @@ class EncoderLayer(nn.Layer):
|
|||
self.normalize_before = normalize_before
|
||||
self.concat_after = concat_after
|
||||
if self.concat_after:
|
||||
self.concat_linear = nn.Linear(size + size, size)
|
||||
self.concat_linear = nn.Linear(size + size, size, bias_attr=True)
|
||||
|
||||
def forward(self, x, mask, cache=None):
|
||||
"""Compute encoded features.
|
||||
|
||||
Args:
|
||||
x_input (paddle.Tensor): Input tensor (#batch, time, size).
|
||||
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
|
||||
cache (paddle.Tensor): Cache tensor of the input (#batch, time - 1, size).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor (#batch, time, size).
|
||||
paddle.Tensor: Mask tensor (#batch, time).
|
||||
Parameters
|
||||
----------
|
||||
x_input : paddle.Tensor
|
||||
Input tensor (#batch, time, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor for the input (#batch, time).
|
||||
cache : paddle.Tensor
|
||||
Cache tensor of the input (#batch, time - 1, size).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, size).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
"""
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
|
@ -82,7 +91,6 @@ class EncoderLayer(nn.Layer):
|
|||
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
|
||||
x_q = x[:, -1:, :]
|
||||
residual = residual[:, -1:, :]
|
||||
# the non-pad mask becomes a pad mask here
|
||||
mask = None if mask is None else mask[:, -1:, :]
|
||||
|
||||
if self.concat_after:
|
||||
|
@ -90,6 +98,7 @@ class EncoderLayer(nn.Layer):
|
|||
(x, self.self_attn(x_q, x, x, mask)), axis=-1)
|
||||
x = residual + self.concat_linear(x_concat)
|
||||
else:
|
||||
|
||||
x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
|
||||
if not self.normalize_before:
|
||||
x = self.norm1(x)
|
||||
|
|
|
@ -32,11 +32,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
|||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||
"""Initialize MultiLayeredConv1d module.
|
||||
|
||||
Args:
|
||||
in_chans (int): Number of input channels.
|
||||
hidden_chans (int): Number of hidden channels.
|
||||
kernel_size (int): Kernel size of conv1d.
|
||||
dropout_rate (float): Dropout rate.
|
||||
Parameters
|
||||
----------
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
|
||||
"""
|
||||
super(MultiLayeredConv1d, self).__init__()
|
||||
|
@ -58,14 +63,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
|||
def forward(self, x):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
"""
|
||||
# x = paddle.nn.ReLU(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
|
||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
||||
[0, 2, 1])
|
||||
|
@ -81,12 +88,16 @@ class Conv1dLinear(paddle.nn.Layer):
|
|||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||
"""Initialize Conv1dLinear module.
|
||||
|
||||
Args:
|
||||
in_chans (int): Number of input channels.
|
||||
hidden_chans (int): Number of hidden channels.
|
||||
kernel_size (int): Kernel size of conv1d.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
super(Conv1dLinear, self).__init__()
|
||||
self.w_1 = paddle.nn.Conv1D(
|
||||
|
@ -95,18 +106,22 @@ class Conv1dLinear(paddle.nn.Layer):
|
|||
kernel_size,
|
||||
stride=1,
|
||||
padding=(kernel_size - 1) // 2, )
|
||||
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans)
|
||||
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True)
|
||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||
self.relu = paddle.nn.ReLU()
|
||||
|
||||
def forward(self, x):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
|
||||
"""
|
||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||
|
|
|
@ -19,11 +19,14 @@ import paddle
|
|||
class PositionwiseFeedForward(paddle.nn.Layer):
|
||||
"""Positionwise feed forward layer.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimenstion.
|
||||
hidden_units (int): The number of hidden units.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
hidden_units : int
|
||||
The number of hidden units.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -33,8 +36,8 @@ class PositionwiseFeedForward(paddle.nn.Layer):
|
|||
activation=paddle.nn.ReLU()):
|
||||
"""Construct an PositionwiseFeedForward object."""
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
self.w_1 = paddle.nn.Linear(idim, hidden_units)
|
||||
self.w_2 = paddle.nn.Linear(hidden_units, idim)
|
||||
self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True)
|
||||
self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True)
|
||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||
self.activation = activation
|
||||
|
||||
|
|
|
@ -29,12 +29,16 @@ class MultiSequential(paddle.nn.Sequential):
|
|||
def repeat(N, fn):
|
||||
"""Repeat module N times.
|
||||
|
||||
Args:
|
||||
N (int): Number of repeat time.
|
||||
fn (Callable): Function to generate module.
|
||||
|
||||
Returns:
|
||||
MultiSequential: Repeated model instance.
|
||||
Parameters
|
||||
----------
|
||||
N : int
|
||||
Number of times to repeat.
|
||||
fn : Callable
|
||||
Function to generate module.
|
||||
|
||||
Returns
|
||||
----------
|
||||
MultiSequential
|
||||
Repeated model instance.
|
||||
"""
|
||||
return MultiSequential(* [fn(n) for n in range(N)])
|
||||
|
|
|
@ -19,10 +19,12 @@ import paddle
|
|||
class LayerNorm(paddle.nn.LayerNorm):
|
||||
"""Layer normalization module.
|
||||
|
||||
Args:
|
||||
nout (int): Output dim size.
|
||||
dim (int): Dimension to be normalized.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nout : int
|
||||
Output dim size.
|
||||
dim : int
|
||||
Dimension to be normalized.
|
||||
"""
|
||||
|
||||
def __init__(self, nout, dim=-1):
|
||||
|
@ -33,12 +35,15 @@ class LayerNorm(paddle.nn.LayerNorm):
|
|||
def forward(self, x):
|
||||
"""Apply layer normalization.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Normalized tensor.
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor.
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Normalized tensor.
|
||||
"""
|
||||
if self.dim == -1:
|
||||
return super(LayerNorm, self).forward(x)
|
||||
|
|
|
@ -28,7 +28,7 @@ def is_broadcastable(shp1, shp2):
|
|||
def masked_fill(xs: paddle.Tensor,
|
||||
mask: paddle.Tensor,
|
||||
value: Union[float, int]):
|
||||
# assert is_broadcastable(xs.shape, mask.shape) is True
|
||||
assert is_broadcastable(xs.shape, mask.shape) is True
|
||||
bshape = paddle.broadcast_shape(xs.shape, mask.shape)
|
||||
mask = mask.broadcast_to(bshape)
|
||||
trues = paddle.ones_like(xs) * value
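A hedged usage sketch of this helper (values are made up): positions where the mask is True are overwritten with `value`, which is how the attention module above uses it:

import paddle
xs = paddle.to_tensor([[1.0, 2.0, 3.0]])
mask = paddle.to_tensor([[False, False, True]])
masked_fill(xs, mask, 0.0)     # -> [[1.0, 2.0, 0.0]]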
|
||||
|
|
|
@ -13,20 +13,27 @@
|
|||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from typeguard import check_argument_types
|
||||
|
||||
|
||||
# pad every item to the longest length in this batch
|
||||
def pad_list(xs, pad_value):
|
||||
"""Perform padding for the list of tensors.
|
||||
|
||||
Args:
|
||||
xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||
pad_value (float): Value for padding.
|
||||
Parameters
|
||||
----------
|
||||
xs : List[Tensor]
|
||||
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||
pad_value : float
|
||||
Value for padding.
|
||||
|
||||
Returns:
|
||||
Tensor: Padded tensor (B, Tmax, `*`).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Padded tensor (B, Tmax, `*`).
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
----------
|
||||
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
||||
>>> x
|
||||
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
||||
|
@ -34,11 +41,9 @@ def pad_list(xs, pad_value):
|
|||
tensor([[1., 1., 1., 1.],
|
||||
[1., 1., 0., 0.],
|
||||
[1., 0., 0., 0.]])
|
||||
|
||||
"""
|
||||
n_batch = len(xs)
|
||||
max_len = max(x.shape[0] for x in xs)
|
||||
# pad = xs[0].new(n_batch, max_len, *xs[0].shape[1:]).fill_(pad_value)
|
||||
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
|
||||
|
||||
for i in range(n_batch):
|
||||
|
@ -50,13 +55,18 @@ def pad_list(xs, pad_value):
|
|||
def make_pad_mask(lengths, length_dim=-1):
|
||||
"""Make mask tensor containing indices of padded part.
|
||||
|
||||
Args:
|
||||
lengths (LongTensor or List): Batch of lengths (B,).
|
||||
Parameters
|
||||
----------
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
|
||||
Returns:
|
||||
Tensor: Mask tensor containing indices of padded part bool.
|
||||
Returns
|
||||
----------
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of padded part.
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
----------
|
||||
With only lengths.
|
||||
|
||||
>>> lengths = [5, 3, 2]
|
||||
|
@ -64,7 +74,6 @@ def make_pad_mask(lengths, length_dim=-1):
|
|||
masks = [[0, 0, 0, 0 ,0],
|
||||
[0, 0, 0, 1, 1],
|
||||
[0, 0, 1, 1, 1]]
|
||||
|
||||
"""
|
||||
if length_dim == 0:
|
||||
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
||||
|
@ -88,17 +97,24 @@ def make_pad_mask(lengths, length_dim=-1):
|
|||
def make_non_pad_mask(lengths, length_dim=-1):
|
||||
"""Make mask tensor containing indices of non-padded part.
|
||||
|
||||
Args:
|
||||
lengths (LongTensor or List): Batch of lengths (B,).
|
||||
xs (Tensor, optional): The reference tensor.
|
||||
Parameters
|
||||
----------
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
xs : Tensor, optional
|
||||
The reference tensor.
|
||||
If set, masks will be the same shape as this tensor.
|
||||
length_dim (int, optional): Dimension indicator of the above tensor.
|
||||
length_dim : int, optional
|
||||
Dimension indicator of the above tensor.
|
||||
See the example.
|
||||
|
||||
Returns:
|
||||
ByteTensor: mask tensor containing indices of padded part bool.
|
||||
Returns
|
||||
----------
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of non-padded part.
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
----------
|
||||
With only lengths.
|
||||
|
||||
>>> lengths = [5, 3, 2]
|
||||
|
@ -106,6 +122,37 @@ def make_non_pad_mask(lengths, length_dim=-1):
|
|||
masks = [[1, 1, 1, 1 ,1],
|
||||
[1, 1, 1, 0, 0],
|
||||
[1, 1, 0, 0, 0]]
|
||||
|
||||
"""
|
||||
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
||||
|
||||
|
||||
def initialize(model: nn.Layer, init: str):
|
||||
"""Initialize weights of a neural network module.
|
||||
|
||||
Parameters are initialized using the given method or distribution.
|
||||
|
||||
Custom initialization routines can be implemented into submodules
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model : paddle.nn.Layer
|
||||
Target.
|
||||
init : str
|
||||
Method of initialization.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
|
||||
if init == "xavier_uniform":
|
||||
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
|
||||
nn.initializer.Constant())
|
||||
elif init == "xavier_normal":
|
||||
nn.initializer.set_global_initializer(nn.initializer.XavierNormal(),
|
||||
nn.initializer.Constant())
|
||||
elif init == "kaiming_uniform":
|
||||
nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
|
||||
nn.initializer.Constant())
|
||||
elif init == "kaiming_normal":
|
||||
nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(),
|
||||
nn.initializer.Constant())
|
||||
else:
|
||||
raise ValueError("Unknown initialization: " + init)
|
||||
|
|