format docstrings

This commit is contained in:
TianYuan 2021-07-13 07:55:56 +00:00
parent 3af3c29a94
commit 6553d1d723
15 changed files with 597 additions and 371 deletions

View File

@ -12,28 +12,26 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Fastspeech2 related modules for paddle""" """Fastspeech2 related modules for paddle"""
import logging
import numpy as np
from typing import Dict from typing import Dict
from typing import Sequence from typing import Sequence
from typing import Tuple from typing import Tuple
from typeguard import check_argument_types from typeguard import check_argument_types
import paddle import paddle
import numpy as np
from paddle import nn from paddle import nn
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from parakeet.modules.fastspeech2_predictor.postnet import Postnet from parakeet.modules.fastspeech2_predictor.postnet import Postnet
from parakeet.modules.nets_utils import make_non_pad_mask from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from parakeet.modules.nets_utils import make_pad_mask
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
class FastSpeech2(nn.Layer): class FastSpeech2(nn.Layer):
@ -155,7 +153,6 @@ class FastSpeech2(nn.Layer):
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
else: else:
print("encoder_type:", encoder_type)
raise ValueError(f"{encoder_type} is not supported.") raise ValueError(f"{encoder_type} is not supported.")
# define duration predictor # define duration predictor
@ -236,6 +233,12 @@ class FastSpeech2(nn.Layer):
use_batch_norm=use_batch_norm, use_batch_norm=use_batch_norm,
dropout_rate=postnet_dropout_rate, )) dropout_rate=postnet_dropout_rate, ))
# initialize parameters
self._reset_parameters(
init_type=init_type,
init_enc_alpha=init_enc_alpha,
init_dec_alpha=init_dec_alpha, )
# define criterions # define criterions
self.criterion = FastSpeech2Loss( self.criterion = FastSpeech2Loss(
use_masking=use_masking, use_weighted_masking=use_weighted_masking) use_masking=use_masking, use_weighted_masking=use_weighted_masking)
@ -253,25 +256,37 @@ class FastSpeech2(nn.Layer):
energy: paddle.Tensor, energy: paddle.Tensor,
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[ energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
str, paddle.Tensor], paddle.Tensor]: str, paddle.Tensor], paddle.Tensor]:
# """Calculate forward propagation. """Calculate forward propagation.
# Args: Parameters
# text (LongTensor): Batch of padded token ids (B, Tmax). ----------
# text_lengths (LongTensor): Batch of lengths of each input (B,). text : LongTensor
# speech (Tensor): Batch of padded target features (B, Lmax, odim). Batch of padded token ids (B, Tmax).
# speech_lengths (LongTensor): Batch of the lengths of each target (B,). text_lengths : LongTensor)
# durations (LongTensor): Batch of padded durations (B, Tmax + 1). Batch of lengths of each input (B,).
# durations_lengths (LongTensor): Batch of duration lengths (B, Tmax + 1). speech : Tensor
# pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1). Batch of padded target features (B, Lmax, odim).
# pitch_lengths (LongTensor): Batch of pitch lengths (B, Tmax + 1). speech_lengths : LongTensor
# energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1). Batch of the lengths of each target (B,).
# energy_lengths (LongTensor): Batch of energy lengths (B, Tmax + 1). durations : LongTensor
# Returns: Batch of padded durations (B, Tmax + 1).
# Tensor: Loss scalar value. durations_lengths : LongTensor
# Dict: Statistics to be monitored. Batch of duration lengths (B, Tmax + 1).
# Tensor: Weight value. pitch : Tensor
Batch of padded token-averaged pitch (B, Tmax + 1, 1).
# """ pitch_lengths : LongTensor
Batch of pitch lengths (B, Tmax + 1).
energy : Tensor
Batch of padded token-averaged energy (B, Tmax + 1, 1).
energy_lengths : LongTensor
Batch of energy lengths (B, Tmax + 1).
Returns
----------
Tensor
Loss scalar value.
Dict
Statistics to be monitored.
"""
text = text[:, :text_lengths.max()] # for data-parallel text = text[:, :text_lengths.max()] # for data-parallel
speech = speech[:, :speech_lengths.max()] # for data-parallel speech = speech[:, :speech_lengths.max()] # for data-parallel
durations = durations[:, :durations_lengths.max()] # for data-parallel durations = durations[:, :durations_lengths.max()] # for data-parallel
@ -282,16 +297,11 @@ class FastSpeech2(nn.Layer):
# Add eos at the last of sequence # Add eos at the last of sequence
# xs = F.pad(text, [0, 1], "constant", self.padding_idx) # xs = F.pad(text, [0, 1], "constant", self.padding_idx)
print("xs.shape in fastspeech2.py before:", text.shape, text)
xs = np.pad(text.numpy(), xs = np.pad(text.numpy(),
pad_width=((0, 0), (0, 1)), pad_width=((0, 0), (0, 1)),
mode="constant", mode="constant",
constant_values=self.padding_idx) constant_values=self.padding_idx)
xs = paddle.to_tensor(xs) xs = paddle.to_tensor(xs)
print("xs.shape in fastspeech2.py end:", xs.shape, xs)
# my_pad = nn.Pad1D(padding=[0, 1], mode="constant", value=self.padding_idx)
# xs = my_pad(text)
# could this index out of bounds? can xs[i, l] be reached? -> yes, because the previous step appended a padding_idx slot, which now becomes eos
for i, l in enumerate(text_lengths): for i, l in enumerate(text_lengths):
xs[i, l] = self.eos xs[i, l] = self.eos
ilens = text_lengths + 1 ilens = text_lengths + 1
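For readers following the padding logic above, here is a minimal numpy-only sketch of the eos-append step, using hypothetical token ids and lengths (the real code does the same on Paddle tensors via text.numpy() and paddle.to_tensor):

import numpy as np

padding_idx, eos = 0, 1                      # hypothetical special ids
text = np.array([[5, 6, 7, 0],
                 [8, 9, 0, 0]])              # (B, Tmax), padded with padding_idx
text_lengths = np.array([3, 2])

# append one extra padding column, then overwrite the first padded slot
# of each sequence with eos, mirroring the preprocessing shown above
xs = np.pad(text, pad_width=((0, 0), (0, 1)),
            mode="constant", constant_values=padding_idx)
for i, l in enumerate(text_lengths):
    xs[i, l] = eos
ilens = text_lengths + 1
# xs is now [[5, 6, 7, 1, 0], [8, 9, 1, 0, 0]], ilens is [4, 3]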
@ -302,23 +312,16 @@ class FastSpeech2(nn.Layer):
# forward propagation # forward propagation
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward( before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
xs, ilens, ys, olens, ds, ps, es, is_inference=False) xs, ilens, ys, olens, ds, ps, es, is_inference=False)
print("d_outs in paddle:", d_outs)
print("p_outs in paddle:", p_outs)
print("e_outs in paddle:", e_outs)
# modify mod part of groundtruth # modify mod part of groundtruth
if self.reduction_factor > 1: if self.reduction_factor > 1:
# needs to be revised
olens = paddle.to_tensor([ olens = paddle.to_tensor([
olen - olen % self.reduction_factor for olen in olens.numpy() olen - olen % self.reduction_factor for olen in olens.numpy()
]) ])
max_olen = max(olens) max_olen = max(olens)
ys = ys[:, :max_olen] ys = ys[:, :max_olen]
# calculate loss # calculate loss
if self.postnet is None: if self.postnet is None:
after_outs = None after_outs = None
# calculate loss # calculate loss
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs, after_outs=after_outs,
@ -363,9 +366,8 @@ class FastSpeech2(nn.Layer):
alpha: float=1.0, ) -> Sequence[paddle.Tensor]: alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
# forward encoder # forward encoder
x_masks = self._source_mask(ilens) x_masks = self._source_mask(ilens)
print("xs.shape in fastspeech2.py:", xs.shape)
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
# forward duration predictor and variance predictors # forward duration predictor and variance predictors
d_masks = make_pad_mask(ilens) d_masks = make_pad_mask(ilens)
@ -377,10 +379,11 @@ class FastSpeech2(nn.Layer):
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1)) e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
else: else:
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1)) e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
print("p_outs.shape:", p_outs.shape)
if is_inference: if is_inference:
d_outs = self.duration_predictor.inference(hs, d_outs = self.duration_predictor.inference(hs,
d_masks) # (B, Tmax) d_masks) # (B, Tmax)
# print("d_outs:",d_outs)
# use prediction in inference # use prediction in inference
# (B, Tmax, 1) # (B, Tmax, 1)
@ -404,7 +407,6 @@ class FastSpeech2(nn.Layer):
# forward decoder # forward decoder
if olens is not None and not is_inference: if olens is not None and not is_inference:
if self.reduction_factor > 1: if self.reduction_factor > 1:
# converting directly with paddle.to_tensor adds an extra dimension, so convert to numpy first
olens_in = paddle.to_tensor( olens_in = paddle.to_tensor(
[olen // self.reduction_factor for olen in olens.numpy()]) [olen // self.reduction_factor for olen in olens.numpy()])
else: else:
@ -412,9 +414,10 @@ class FastSpeech2(nn.Layer):
h_masks = self._source_mask(olens_in) h_masks = self._source_mask(olens_in)
else: else:
h_masks = None h_masks = None
# (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
before_outs = self.feat_out(zs).reshape((zs.shape[0], -1, self.odim))
# postnet -> (B, Lmax//r * r, odim) # postnet -> (B, Lmax//r * r, odim)
if self.postnet is None: if self.postnet is None:
@ -437,20 +440,30 @@ class FastSpeech2(nn.Layer):
paddle.Tensor, paddle.Tensor, paddle.Tensor]: paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters. """Generate the sequence of features given the sequences of characters.
Args: Parameters
text (LongTensor): Input sequence of characters (T,). ----------
speech (Tensor, optional): Feature sequence to extract style (N, idim). text : LongTensor
durations (LongTensor, optional): Groundtruth of duration (T + 1,). Input sequence of characters (T,).
pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1). speech : Tensor, optional
energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1). Feature sequence to extract style (N, idim).
alpha (float, optional): Alpha to control the speed. durations : LongTensor, optional
use_teacher_forcing (bool, optional): Whether to use teacher forcing. Groundtruth of duration (T + 1,).
If true, groundtruth of duration, pitch and energy will be used. pitch : Tensor, optional
Groundtruth of token-averaged pitch (T + 1, 1).
energy : Tensor, optional
Groundtruth of token-averaged energy (T + 1, 1).
alpha : float, optional
Alpha to control the speed.
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
Returns: Returns
Tensor: Output sequence of features (L, odim). ----------
None: Dummy for compatibility. Tensor
None: Dummy for compatibility. Output sequence of features (L, odim).
None
Dummy for compatibility.
""" """
x, y = text, speech x, y = text, speech
@ -460,13 +473,15 @@ class FastSpeech2(nn.Layer):
x = np.pad(text.numpy(), x = np.pad(text.numpy(),
pad_width=((0, 1)), pad_width=((0, 1)),
mode="constant", mode="constant",
constant_values=self.padding_idx) constant_values=self.eos)
x = paddle.to_tensor(x) x = paddle.to_tensor(x)
# setup batch axis # setup batch axis
ilens = paddle.to_tensor( ilens = paddle.to_tensor(
[x.shape[0]], dtype=paddle.int64, place=x.place) [x.shape[0]], dtype=paddle.int64, place=x.place)
xs, ys = x.unsqueeze(0), None xs, ys = x.unsqueeze(0), None
if y is not None: if y is not None:
ys = y.unsqueeze(0) ys = y.unsqueeze(0)
@ -493,14 +508,19 @@ class FastSpeech2(nn.Layer):
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for self-attention. """Make masks for self-attention.
Args: Parameters
ilens (LongTensor): Batch of lengths (B,). ----------
ilens : LongTensor
Batch of lengths (B,).
Returns: Returns
Tensor: Mask tensor for self-attention. -------
Tensor
Mask tensor for self-attention.
dtype=paddle.bool dtype=paddle.bool
Examples: Examples
-------
>>> ilens = [5, 3] >>> ilens = [5, 3]
>>> self._source_mask(ilens) >>> self._source_mask(ilens)
tensor([[[1, 1, 1, 1, 1], tensor([[[1, 1, 1, 1, 1],
@ -510,6 +530,29 @@ class FastSpeech2(nn.Layer):
x_masks = make_non_pad_mask(ilens) x_masks = make_non_pad_mask(ilens)
return x_masks.unsqueeze(-2) return x_masks.unsqueeze(-2)
def _reset_parameters(self,
init_type: str,
init_enc_alpha: float,
init_dec_alpha: float):
# initialize parameters
initialize(self, init_type)
# initialize alpha in scaled positional encoding
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
self.encoder.embed[-1].alpha = paddle.create_parameter(
shape=init_enc_alpha.shape,
dtype=str(init_enc_alpha.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(
init_enc_alpha))
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
self.decoder.embed[-1].alpha = paddle.create_parameter(
shape=init_dec_alpha.shape,
dtype=str(init_dec_alpha.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(
init_dec_alpha))
class FastSpeech2Loss(nn.Layer): class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2.""" """Loss function module for FastSpeech2."""
@ -519,12 +562,12 @@ class FastSpeech2Loss(nn.Layer):
use_weighted_masking: bool=False): use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module. """Initialize feed-forward Transformer loss module.
Args: Parameters
use_masking (bool): ----------
use_masking : bool
Whether to apply masking for padded part in loss calculation. Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool): use_weighted_masking : bool
Whether to apply weighted masking in loss calculation. Whether to apply weighted masking in loss calculation.
""" """
assert check_argument_types() assert check_argument_types()
super().__init__() super().__init__()
@ -555,24 +598,41 @@ class FastSpeech2Loss(nn.Layer):
paddle.Tensor, paddle.Tensor]: paddle.Tensor, paddle.Tensor]:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim). ----------
before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim). after_outs : Tensor
d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax). Batch of outputs after postnets (B, Lmax, odim).
p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). before_outs : Tensor
e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1). Batch of outputs before postnets (B, Lmax, odim).
ys (Tensor): Batch of target features (B, Lmax, odim). d_outs : LongTensor
ds (LongTensor): Batch of durations (B, Tmax). Batch of outputs of duration predictor (B, Tmax).
ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1). p_outs : Tensor
es (Tensor): Batch of target token-averaged energy (B, Tmax, 1). Batch of outputs of pitch predictor (B, Tmax, 1).
ilens (LongTensor): Batch of the lengths of each input (B,). e_outs : Tensor
olens (LongTensor): Batch of the lengths of each target (B,). Batch of outputs of energy predictor (B, Tmax, 1).
ys : Tensor
Batch of target features (B, Lmax, odim).
ds : LongTensor
Batch of durations (B, Tmax).
ps : Tensor
Batch of target token-averaged pitch (B, Tmax, 1).
es : Tensor
Batch of target token-averaged energy (B, Tmax, 1).
ilens : LongTensor
Batch of the lengths of each input (B,).
olens : LongTensor
Batch of the lengths of each target (B,).
Returns: Returns
Tensor: L1 loss value. ----------
Tensor: Duration predictor loss value. Tensor
Tensor: Pitch predictor loss value. L1 loss value.
Tensor: Energy predictor loss value. Tensor
Duration predictor loss value.
Tensor
Pitch predictor loss value.
Tensor
Energy predictor loss value.
""" """
# apply mask to remove padded part # apply mask to remove padded part

View File

@ -15,7 +15,6 @@
import paddle import paddle
from paddle import nn from paddle import nn
from parakeet.modules.layer_norm import LayerNorm from parakeet.modules.layer_norm import LayerNorm
from parakeet.modules.masked_fill import masked_fill from parakeet.modules.masked_fill import masked_fill
@ -31,7 +30,8 @@ class DurationPredictor(nn.Layer):
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
https://arxiv.org/pdf/1905.09263.pdf https://arxiv.org/pdf/1905.09263.pdf
Note: Note
----------
The calculation domain of outputs is different The calculation domain of outputs is different
between in `forward` and in `inference`. In `forward`, between in `forward` and in `inference`. In `forward`,
the outputs are calculated in log domain but in `inference`, the outputs are calculated in log domain but in `inference`,
@ -48,13 +48,20 @@ class DurationPredictor(nn.Layer):
offset=1.0): offset=1.0):
"""Initilize duration predictor module. """Initilize duration predictor module.
Args: Parameters
idim (int): Input dimension. ----------
n_layers (int, optional): Number of convolutional layers. idim : int
n_chans (int, optional): Number of channels of convolutional layers. Input dimension.
kernel_size (int, optional): Kernel size of convolutional layers. n_layers : int, optional
dropout_rate (float, optional): Dropout rate. Number of convolutional layers.
offset (float, optional): Offset value to avoid nan in log domain. n_chans : int, optional
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
offset : float, optional
Offset value to avoid nan in log domain.
""" """
super(DurationPredictor, self).__init__() super(DurationPredictor, self).__init__()
@ -74,7 +81,7 @@ class DurationPredictor(nn.Layer):
LayerNorm( LayerNorm(
n_chans, dim=1), n_chans, dim=1),
nn.Dropout(dropout_rate), )) nn.Dropout(dropout_rate), ))
self.linear = nn.Linear(n_chans, 1) self.linear = nn.Linear(n_chans, 1, bias_attr=True)
def _forward(self, xs, x_masks=None, is_inference=False): def _forward(self, xs, x_masks=None, is_inference=False):
# (B, idim, Tmax) # (B, idim, Tmax)
@ -83,7 +90,7 @@ class DurationPredictor(nn.Layer):
for f in self.conv: for f in self.conv:
xs = f(xs) xs = f(xs)
# NOTE: calculate in log domain # NOTE: calculate in log domain
# (B, Tmax) # (B, Tmax)
xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1) xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1)
@ -99,28 +106,34 @@ class DurationPredictor(nn.Layer):
def forward(self, xs, x_masks=None): def forward(self, xs, x_masks=None):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
xs (Tensor): Batch of input sequences (B, Tmax, idim). ----------
x_masks (ByteTensor, optional): xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : ByteTensor, optional
Batch of masks indicating padded part (B, Tmax). Batch of masks indicating padded part (B, Tmax).
Returns: Returns
Tensor: Batch of predicted durations in log domain (B, Tmax). ----------
Tensor
Batch of predicted durations in log domain (B, Tmax).
""" """
return self._forward(xs, x_masks, False) return self._forward(xs, x_masks, False)
def inference(self, xs, x_masks=None): def inference(self, xs, x_masks=None):
"""Inference duration. """Inference duration.
Args: Parameters
xs (Tensor): Batch of input sequences (B, Tmax, idim). ----------
x_masks (ByteTensor, optional): xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : Tensor(bool), optional
Batch of masks indicating padded part (B, Tmax). Batch of masks indicating padded part (B, Tmax).
Returns: Returns
LongTensor: Batch of predicted durations in linear domain int64 (B, Tmax). ----------
LongTensor
Batch of predicted durations in linear domain int64 (B, Tmax).
""" """
return self._forward(xs, x_masks, True) return self._forward(xs, x_masks, True)
@ -135,10 +148,12 @@ class DurationPredictorLoss(nn.Layer):
def __init__(self, offset=1.0, reduction="mean"): def __init__(self, offset=1.0, reduction="mean"):
"""Initilize duration predictor loss module. """Initilize duration predictor loss module.
Args: Parameters
offset (float, optional): Offset value to avoid nan in log domain. ----------
reduction (str): Reduction type in loss calculation. offset : float, optional
Offset value to avoid nan in log domain.
reduction : str
Reduction type in loss calculation.
""" """
super(DurationPredictorLoss, self).__init__() super(DurationPredictorLoss, self).__init__()
self.criterion = nn.MSELoss(reduction=reduction) self.criterion = nn.MSELoss(reduction=reduction)
@ -147,16 +162,21 @@ class DurationPredictorLoss(nn.Layer):
def forward(self, outputs, targets): def forward(self, outputs, targets):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
outputs (Tensor): Batch of prediction durations in log domain (B, T) ----------
targets (LongTensor): Batch of groundtruth durations in linear domain (B, T) outputs : Tensor
Batch of predicted durations in log domain (B, T).
targets : LongTensor
Batch of groundtruth durations in linear domain (B, T)
Returns: Returns
Tensor: Mean squared error loss value. ----------
Tensor
Mean squared error loss value.
Note: Note
----------
`outputs` is in log domain but `targets` is in linear domain. `outputs` is in log domain but `targets` is in linear domain.
""" """
# NOTE: outputs is in log domain while targets in linear # NOTE: outputs is in log domain while targets in linear
targets = paddle.log(targets.cast(dtype='float32') + self.offset) targets = paddle.log(targets.cast(dtype='float32') + self.offset)
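As the note above says, predictions live in the log domain while targets are linear. Here is a small numpy sketch of the comparison DurationPredictorLoss performs, with made-up durations (the MSE criterion is written out as a plain mean of squared errors):

import numpy as np

offset = 1.0
targets = np.array([2.0, 3.0, 1.0, 4.0])        # ground-truth durations, linear domain
outputs = np.log(targets + offset) + 0.1        # pretend log-domain predictions, slightly off

log_targets = np.log(targets + offset)          # same transform as the line above
mse = np.mean((outputs - log_targets) ** 2)     # ~0.01 by construction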

View File

@ -13,8 +13,6 @@
# limitations under the License. # limitations under the License.
"""Length regulator related modules.""" """Length regulator related modules."""
import logging
import numpy as np import numpy as np
import paddle import paddle
from paddle import nn from paddle import nn
@ -37,8 +35,10 @@ class LengthRegulator(nn.Layer):
def __init__(self, pad_value=0.0): def __init__(self, pad_value=0.0):
"""Initilize length regulator module. """Initilize length regulator module.
Args: Parameters
pad_value (float, optional): Value used for padding. ----------
pad_value : float, optional
Value used for padding.
""" """
super().__init__() super().__init__()
@ -68,14 +68,19 @@ class LengthRegulator(nn.Layer):
def forward(self, xs, ds, alpha=1.0): def forward(self, xs, ds, alpha=1.0):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). ----------
ds (LongTensor): Batch of durations of each frame (B, T). xs : Tensor
alpha (float, optional): Alpha value to control speed of speech. Batch of sequences of char or phoneme embeddings (B, Tmax, D).
ds : LongTensor
Returns: Batch of durations of each frame (B, T).
Tensor: replicated input tensor based on durations (B, T*, D). alpha : float, optional
Alpha value to control speed of speech.
Returns
----------
Tensor
Replicated input tensor based on durations (B, T*, D).
""" """
if alpha != 1.0: if alpha != 1.0:
assert alpha > 0 assert alpha > 0
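The docstring above describes replicating each phoneme embedding by its duration. A hedged numpy sketch of that expansion with toy values (the module itself additionally pads the expanded sequences in a batch with pad_value):

import numpy as np

hs = np.array([[0.1, 0.1],
               [0.2, 0.2],
               [0.3, 0.3]])                  # (T=3, D=2) phoneme-level features
ds = np.array([2, 0, 3])                     # predicted frames per phoneme
alpha = 1.0                                  # speed control; durations are rescaled by alpha
ds = np.round(ds * alpha).astype(np.int64)

frames = np.repeat(hs, ds, axis=0)           # (sum(ds), D) = (5, 2) frame-level features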

View File

@ -43,15 +43,22 @@ class Postnet(nn.Layer):
use_batch_norm=True, ): use_batch_norm=True, ):
"""Initialize postnet module. """Initialize postnet module.
Args: Parameters
idim (int): Dimension of the inputs. ----------
odim (int): Dimension of the outputs. idim : int
n_layers (int, optional): The number of layers. Dimension of the inputs.
n_filts (int, optional): The number of filter size. odim : int
n_units (int, optional): The number of filter channels. Dimension of the outputs.
use_batch_norm (bool, optional): Whether to use batch normalization.. n_layers : int, optional
dropout_rate (float, optional): Dropout rate.. The number of layers.
n_filts : int, optional
The number of filter size.
n_units : int, optional
The number of filter channels.
use_batch_norm : bool, optional
Whether to use batch normalization.
dropout_rate : float, optional
Dropout rate.
""" """
super(Postnet, self).__init__() super(Postnet, self).__init__()
self.postnet = nn.LayerList() self.postnet = nn.LayerList()
@ -111,11 +118,15 @@ class Postnet(nn.Layer):
def forward(self, xs): def forward(self, xs):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax). ----------
xs : Tensor
Batch of the sequences of padded input tensors (B, idim, Tmax).
Returns: Returns
Tensor: Batch of padded output tensor. (B, odim, Tmax). ----------
Tensor
Batch of padded output tensor. (B, odim, Tmax).
""" """
for i in six.moves.range(len(self.postnet)): for i in six.moves.range(len(self.postnet)):

View File

@ -15,10 +15,8 @@
import paddle import paddle
from paddle import nn from paddle import nn
from parakeet.modules.layer_norm import LayerNorm from parakeet.modules.layer_norm import LayerNorm
from parakeet.modules.masked_fill import masked_fill from parakeet.modules.masked_fill import masked_fill
from typeguard import check_argument_types from typeguard import check_argument_types
@ -43,13 +41,18 @@ class VariancePredictor(nn.Layer):
dropout_rate: float=0.5, ): dropout_rate: float=0.5, ):
"""Initilize duration predictor module. """Initilize duration predictor module.
Args: Parameters
idim (int): Input dimension. ----------
n_layers (int, optional): Number of convolutional layers. idim : int
n_chans (int, optional): Number of channels of convolutional layers. Input dimension.
kernel_size (int, optional): Kernel size of convolutional layers. n_layers : int, optional
dropout_rate (float, optional): Dropout rate. Number of convolutional layers.
n_chans : int, optional
Number of channels of convolutional layers.
kernel_size : int, optional
Kernel size of convolutional layers.
dropout_rate : float, optional
Dropout rate.
""" """
assert check_argument_types() assert check_argument_types()
super().__init__() super().__init__()
@ -70,26 +73,30 @@ class VariancePredictor(nn.Layer):
n_chans, dim=1), n_chans, dim=1),
nn.Dropout(dropout_rate), )) nn.Dropout(dropout_rate), ))
self.linear = nn.Linear(n_chans, 1) self.linear = nn.Linear(n_chans, 1, bias_attr=True)
def forward(self, xs: paddle.Tensor, def forward(self, xs: paddle.Tensor,
x_masks: paddle.Tensor=None) -> paddle.Tensor: x_masks: paddle.Tensor=None) -> paddle.Tensor:
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
xs (Tensor): Batch of input sequences (B, Tmax, idim). ----------
x_masks (ByteTensor, optional): xs : Tensor
Batch of input sequences (B, Tmax, idim).
x_masks : Tensor(bool), optional
Batch of masks indicating padded part (B, Tmax, 1). Batch of masks indicating padded part (B, Tmax, 1).
Returns: Returns
Tensor: Batch of predicted sequences (B, Tmax, 1). ----------
Tensor
Batch of predicted sequences (B, Tmax, 1).
""" """
# (B, idim, Tmax) # (B, idim, Tmax)
xs = xs.transpose([0, 2, 1]) xs = xs.transpose([0, 2, 1])
# (B, C, Tmax) # (B, C, Tmax)
for f in self.conv: for f in self.conv:
# (B, C, Tmax)
xs = f(xs)
# (B, Tmax, 1) # (B, Tmax, 1)
xs = self.linear(xs.transpose([0, 2, 1])) xs = self.linear(xs.transpose([0, 2, 1]))

View File

@ -16,23 +16,22 @@
import math import math
import numpy import numpy
import paddle import paddle
from paddle import nn from paddle import nn
from paddle.fluid.layers import sequence_mask
from parakeet.modules.masked_fill import masked_fill from parakeet.modules.masked_fill import masked_fill
class MultiHeadedAttention(nn.Layer): class MultiHeadedAttention(nn.Layer):
"""Multi-Head Attention layer. """Multi-Head Attention layer.
Args: Parameters
n_head (int): The number of heads. ----------
n_feat (int): The number of features. n_head : int
dropout_rate (float): Dropout rate. The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
""" """
def __init__(self, n_head, n_feat, dropout_rate): def __init__(self, n_head, n_feat, dropout_rate):
@ -42,33 +41,42 @@ class MultiHeadedAttention(nn.Layer):
# We assume d_v always equals d_k # We assume d_v always equals d_k
self.d_k = n_feat // n_head self.d_k = n_feat // n_head
self.h = n_head self.h = n_head
self.linear_q = nn.Linear(n_feat, n_feat) self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True)
self.linear_k = nn.Linear(n_feat, n_feat) self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True)
self.linear_v = nn.Linear(n_feat, n_feat) self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True)
self.linear_out = nn.Linear(n_feat, n_feat) self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True)
self.attn = None self.attn = None
self.dropout = nn.Dropout(p=dropout_rate) self.dropout = nn.Dropout(p=dropout_rate)
def forward_qkv(self, query, key, value): def forward_qkv(self, query, key, value):
"""Transform query, key and value. """Transform query, key and value.
Args: Parameters
query (paddle.Tensor): Query tensor (#batch, time1, size). ----------
key (paddle.Tensor): Key tensor (#batch, time2, size). query : paddle.Tensor
value (paddle.Tensor): Value tensor (#batch, time2, size). query tensor (#batch, time1, size).
key : paddle.Tensor
Returns: Key tensor (#batch, time2, size).
paddle.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). value : paddle.Tensor
paddle.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). Value tensor (#batch, time2, size).
paddle.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
Returns
----------
paddle.Tensor
Transformed query tensor (#batch, n_head, time1, d_k).
paddle.Tensor
Transformed key tensor (#batch, n_head, time2, d_k).
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
""" """
n_batch = query.shape[0] n_batch = query.shape[0]
q = paddle.reshape( q = paddle.reshape(
self.linear_q(query), [n_batch, -1, self.h, self.d_k]) self.linear_q(query), [n_batch, -1, self.h, self.d_k])
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k]) k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
v = paddle.reshape( v = paddle.reshape(
self.linear_v(value), [n_batch, -1, self.h, self.d_k]) self.linear_v(value), [n_batch, -1, self.h, self.d_k])
# (batch, head, time1, d_k) # (batch, head, time1, d_k)
q = q.transpose((0, 2, 1, 3)) q = q.transpose((0, 2, 1, 3))
# (batch, head, time2, d_k) # (batch, head, time2, d_k)
@ -80,44 +88,40 @@ class MultiHeadedAttention(nn.Layer):
def forward_attention(self, value, scores, mask=None): def forward_attention(self, value, scores, mask=None):
"""Compute attention context vector. """Compute attention context vector.
Args: Parameters
value (paddle.Tensor): Transformed value (#batch, n_head, time2, d_k). ----------
scores (paddle.Tensor): Attention score (#batch, n_head, time1, time2). value : paddle.Tensor
mask (paddle.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). Transformed value (#batch, n_head, time2, d_k).
scores : paddle.Tensor
Attention score (#batch, n_head, time1, time2).
mask : paddle.Tensor
Mask (#batch, 1, time2) or (#batch, time1, time2).
Returns: Returns
paddle.Tensor: Transformed value (#batch, time1, d_model) ----------
paddle.Tensor:
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2). weighted by the attention score (#batch, time1, time2).
""" """
n_batch = value.shape[0] n_batch = value.shape[0]
softmax = paddle.nn.Softmax(axis=-1) softmax = paddle.nn.Softmax(axis=-1)
if mask is not None: if mask is not None:
mask = mask.unsqueeze(1) mask = mask.unsqueeze(1)
# invert the mask: padded positions become True, and those positions are then replaced with 0
mask = paddle.logical_not(mask) mask = paddle.logical_not(mask)
# mask = paddle.cast(mask, dtype='int64')
# positions where mask == 1 are replaced with min_value
# scores = scores.masked_fill(mask, min_value)
min_value = float( min_value = float(
numpy.finfo( numpy.finfo(
paddle.to_tensor( paddle.to_tensor(
0, dtype=scores.dtype).numpy().dtype).min) 0, dtype=scores.dtype).numpy().dtype).min)
scores = masked_fill(scores, mask, min_value) scores = masked_fill(scores, mask, min_value)
# (batch, head, time1, time2)
self.attn = softmax(scores)
# fill elements of the tensor where mask == 1 with value, i.e. keep the values where mask == 0
# self.attn = torch.softmax(scores, dim=-1).masked_fill(
# mask, 0.0
# ) # (batch, head, time1, time2)
# keep positions where mask == 0, set the rest to 0
self.attn = masked_fill(self.attn, mask, 0.0) self.attn = masked_fill(self.attn, mask, 0.0)
else: else:
# (batch, head, time1, time2)
self.attn = softmax(scores)
# (batch, head, time1, time2)
p_attn = self.dropout(self.attn) p_attn = self.dropout(self.attn)
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k) # (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
x = paddle.matmul(p_attn, value) x = paddle.matmul(p_attn, value)
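To make the masking trick above concrete, here is a small numpy sketch with invented scores: padded key positions are pushed to the most negative finite value before the softmax and zeroed afterwards, so the attention weights over valid keys still sum to one.

import numpy as np

scores = np.array([[[1.0, 2.0, 3.0],
                    [0.5, 0.5, 0.5]]])            # (batch=1, time1=2, time2=3)
pad_mask = np.array([[[False, False, True]]])     # True marks a padded key position

min_value = np.finfo(scores.dtype).min
masked = np.where(pad_mask, min_value, scores)    # masked_fill(scores, mask, min_value)

attn = np.exp(masked - masked.max(axis=-1, keepdims=True))
attn = attn / attn.sum(axis=-1, keepdims=True)    # softmax over time2
attn = np.where(pad_mask, 0.0, attn)              # masked_fill(attn, mask, 0.0)
# each row of attn sums to ~1 over the two valid keys; the padded column is 0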
@ -130,16 +134,21 @@ class MultiHeadedAttention(nn.Layer):
def forward(self, query, key, value, mask=None): def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention. """Compute scaled dot product attention.
Args: Parameters
query (paddle.Tensor): Query tensor (#batch, time1, size). ----------
key (paddle.Tensor): Key tensor (#batch, time2, size). query : paddle.Tensor
value (paddle.Tensor): Value tensor (#batch, time2, size). Query tensor (#batch, time1, size).
mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or key : paddle.Tensor
(#batch, time1, time2). Key tensor (#batch, time2, size).
value : paddle.Tensor
Returns: Value tensor (#batch, time2, size).
paddle.Tensor: Output tensor (#batch, time1, d_model). mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
""" """
q, k, v = self.forward_qkv(query, key, value) q, k, v = self.forward_qkv(query, key, value)
scores = paddle.matmul(q, k.transpose( scores = paddle.matmul(q, k.transpose(

View File

@ -22,14 +22,16 @@ from paddle import nn
class PositionalEncoding(nn.Layer): class PositionalEncoding(nn.Layer):
"""Positional encoding. """Positional encoding.
Args: Parameters
d_model (int): Embedding dimension. ----------
dropout_rate (float): Dropout rate. d_model : int
max_len (int): Maximum input length. Embedding dimension.
reverse (bool): Whether to reverse the input position. Only for dropout_rate : float
the class LegacyRelPositionalEncoding. We remove it in the current Dropout rate.
class RelPositionalEncoding. max_len : int
Maximum input length.
reverse : bool
Whether to reverse the input position. Only for the class LegacyRelPositionalEncoding.
""" """
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
@ -47,7 +49,6 @@ class PositionalEncoding(nn.Layer):
pe = paddle.zeros([x.shape[1], self.d_model]) pe = paddle.zeros([x.shape[1], self.d_model])
if self.reverse: if self.reverse:
# (x.shape[1],1)
position = paddle.arange( position = paddle.arange(
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1) x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
else: else:
@ -65,12 +66,15 @@ class PositionalEncoding(nn.Layer):
def forward(self, x: paddle.Tensor): def forward(self, x: paddle.Tensor):
"""Add positional encoding. """Add positional encoding.
Args: Parameters
x (paddle.Tensor): Input tensor (batch, time, `*`). ----------
x : paddle.Tensor
Returns: Input tensor (batch, time, `*`).
paddle.Tensor: Encoded tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
""" """
self.extend_pe(x) self.extend_pe(x)
x = x * self.xscale + self.pe[:, :x.shape[1]] x = x * self.xscale + self.pe[:, :x.shape[1]]
@ -82,11 +86,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
See Sec. 3.2 https://arxiv.org/abs/1809.08895 See Sec. 3.2 https://arxiv.org/abs/1809.08895
Args: Parameters
d_model (int): Embedding dimension. ----------
dropout_rate (float): Dropout rate. d_model : int
max_len (int): Maximum input length. Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
""" """
def __init__(self, d_model, dropout_rate, max_len=5000): def __init__(self, d_model, dropout_rate, max_len=5000):
@ -106,12 +113,15 @@ class ScaledPositionalEncoding(PositionalEncoding):
def forward(self, x): def forward(self, x):
"""Add positional encoding. """Add positional encoding.
Args: Parameters
x (paddle.Tensor): Input tensor (batch, time, `*`). ----------
x : paddle.Tensor
Returns: Input tensor (batch, time, `*`).
paddle.Tensor: Encoded tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
""" """
self.extend_pe(x) self.extend_pe(x)
x = x + self.alpha * self.pe[:, :x.shape[1]] x = x + self.alpha * self.pe[:, :x.shape[1]]
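For reference, a hedged numpy sketch of the standard sinusoidal table these classes are assumed to build, and of the two ways it is combined with the input (PositionalEncoding scales the input by xscale and adds pe; ScaledPositionalEncoding adds a learnable alpha * pe):

import numpy as np

d_model, max_len = 4, 3
position = np.arange(max_len, dtype=np.float32)[:, None]              # (T, 1)
div_term = np.exp(np.arange(0, d_model, 2, dtype=np.float32)
                  * -(np.log(10000.0) / d_model))                     # (d_model/2,)

pe = np.zeros((max_len, d_model), dtype=np.float32)
pe[:, 0::2] = np.sin(position * div_term)
pe[:, 1::2] = np.cos(position * div_term)

x = np.zeros((1, max_len, d_model), dtype=np.float32)                 # dummy embeddings
xscale = np.sqrt(d_model)
out_plain = x * xscale + pe[None]                                     # PositionalEncoding
alpha = 1.0                                                           # learnable in the scaled variant
out_scaled = x + alpha * pe[None]                                     # ScaledPositionalEncoding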

View File

@ -12,19 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import math
import numpy
import logging import logging
import paddle
from paddle import nn from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.fluid.layers import sequence_mask
import sys
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
@ -35,28 +27,44 @@ from parakeet.modules.fastspeech2_transformer.repeat import repeat
class Encoder(nn.Layer): class Encoder(nn.Layer):
"""Transformer encoder module. """Transformer encoder module.
Args: Parameters
idim (int): Input dimension. ----------
attention_dim (int): Dimension of attention. idim : int
attention_heads (int): The number of heads of multi head attention. Input dimension.
linear_units (int): The number of units of position-wise feed forward. attention_dim : int
num_blocks (int): The number of decoder blocks. Dimension of attention.
dropout_rate (float): Dropout rate. attention_heads : int
positional_dropout_rate (float): Dropout rate after adding positional encoding. The number of heads of multi head attention.
attention_dropout_rate (float): Dropout rate in attention. linear_units : int
input_layer (Union[str, paddle.nn.Layer]): Input layer type. The number of units of position-wise feed forward.
pos_enc_class (paddle.nn.Layer): Positional encoding module class. num_blocks : int
The number of encoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding `or `ScaledPositionalEncoding` `PositionalEncoding `or `ScaledPositionalEncoding`
normalize_before (bool): Whether to use layer_norm before the first block. normalize_before : bool
concat_after (bool): Whether to concat attention layer's input and output. Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied. if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x))) i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x) if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". positionwise_layer_type : str
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. "linear", "conv1d", or "conv1d-linear".
selfattention_layer_type (str): Encoder attention layer type. positionwise_conv_kernel_size : int
padding_idx (int): Padding idx for input_layer=embed. Kernel size of positionwise conv1d layer.
selfattention_layer_type : str
Encoder attention layer type.
padding_idx : int
Padding idx for input_layer=embed.
""" """
def __init__( def __init__(
@ -82,7 +90,8 @@ class Encoder(nn.Layer):
self.conv_subsampling_factor = 1 self.conv_subsampling_factor = 1
if input_layer == "linear": if input_layer == "linear":
self.embed = nn.Sequential( self.embed = nn.Sequential(
nn.Linear(idim, attention_dim), nn.Linear(
idim, attention_dim, bias_attr=True),
nn.LayerNorm(attention_dim), nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate), nn.Dropout(dropout_rate),
nn.ReLU(), nn.ReLU(),
@ -169,14 +178,19 @@ class Encoder(nn.Layer):
def forward(self, xs, masks): def forward(self, xs, masks):
"""Encode input sequence. """Encode input sequence.
Args: Parameters
xs (paddle.Tensor): Input tensor (#batch, time, idim). ----------
masks (paddle.Tensor): Mask tensor (#batch, time). xs : paddle.Tensor
Input tensor (#batch, time, idim).
Returns: masks : paddle.Tensor
paddle.Tensor: Output tensor (#batch, time, attention_dim). Mask tensor (#batch, time).
paddle.Tensor: Mask tensor (#batch, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
""" """
xs = self.embed(xs) xs = self.embed(xs)
xs, masks = self.encoders(xs, masks) xs, masks = self.encoders(xs, masks)
@ -187,16 +201,23 @@ class Encoder(nn.Layer):
def forward_one_step(self, xs, masks, cache=None): def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame. """Encode input frame.
Args: Parameters
xs (paddle.Tensor): Input tensor. ----------
masks (paddle.Tensor): Mask tensor. xs : paddle.Tensor
cache (List[paddle.Tensor]): List of cache tensors. Input tensor.
masks : paddle.Tensor
Returns: Mask tensor.
paddle.Tensor: Output tensor. cache : List[paddle.Tensor]
paddle.Tensor: Mask tensor. List of cache tensors.
List[paddle.Tensor]: List of new cache tensors.
Returns
----------
paddle.Tensor
Output tensor.
paddle.Tensor
Mask tensor.
List[paddle.Tensor]
List of new cache tensors.
""" """
xs = self.embed(xs) xs = self.embed(xs)

View File

@ -14,28 +14,31 @@
"""Encoder self-attention layer definition.""" """Encoder self-attention layer definition."""
import paddle import paddle
from paddle import nn from paddle import nn
class EncoderLayer(nn.Layer): class EncoderLayer(nn.Layer):
"""Encoder layer module. """Encoder layer module.
Args: Parameters
size (int): Input dimension. ----------
self_attn (paddle.nn.Layer): Self-attention module instance. size : int
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance Input dimension.
can be used as the argument. self_attn : paddle.nn.Layer
feed_forward (paddle.nn.Layer): Feed-forward module instance. Self-attention module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance `MultiHeadedAttention` instance can be used as the argument.
can be used as the argument. feed_forward : paddle.nn.Layer
dropout_rate (float): Dropout rate. Feed-forward module instance.
normalize_before (bool): Whether to use layer_norm before the first block. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
concat_after (bool): Whether to concat attention layer's input and output. dropout_rate : float
Dropout rate.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied. if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x))) i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x) if False, no additional linear will be applied. i.e. x -> x + att(x)
""" """
def __init__( def __init__(
@ -57,20 +60,26 @@ class EncoderLayer(nn.Layer):
self.normalize_before = normalize_before self.normalize_before = normalize_before
self.concat_after = concat_after self.concat_after = concat_after
if self.concat_after: if self.concat_after:
self.concat_linear = nn.Linear(size + size, size) self.concat_linear = nn.Linear(size + size, size, bias_attr=True)
def forward(self, x, mask, cache=None): def forward(self, x, mask, cache=None):
"""Compute encoded features. """Compute encoded features.
Args: Parameters
x_input (paddle.Tensor): Input tensor (#batch, time, size). ----------
mask (paddle.Tensor): Mask tensor for the input (#batch, time). x_input : paddle.Tensor
cache (paddle.Tensor): Cache tensor of the input (#batch, time - 1, size). Input tensor (#batch, time, size).
mask : paddle.Tensor
Returns: Mask tensor for the input (#batch, time).
paddle.Tensor: Output tensor (#batch, time, size). cache : paddle.Tensor
paddle.Tensor: Mask tensor (#batch, time). Cache tensor of the input (#batch, time - 1, size).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, size).
paddle.Tensor
Mask tensor (#batch, time).
""" """
residual = x residual = x
if self.normalize_before: if self.normalize_before:
@ -82,7 +91,6 @@ class EncoderLayer(nn.Layer):
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
x_q = x[:, -1:, :] x_q = x[:, -1:, :]
residual = residual[:, -1:, :] residual = residual[:, -1:, :]
# the non-pad mask becomes a pad mask
mask = None if mask is None else mask[:, -1:, :] mask = None if mask is None else mask[:, -1:, :]
if self.concat_after: if self.concat_after:
@ -90,6 +98,7 @@ class EncoderLayer(nn.Layer):
(x, self.self_attn(x_q, x, x, mask)), axis=-1) (x, self.self_attn(x_q, x, x, mask)), axis=-1)
x = residual + self.concat_linear(x_concat) x = residual + self.concat_linear(x_concat)
else: else:
x = residual + self.dropout(self.self_attn(x_q, x, x, mask)) x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
if not self.normalize_before: if not self.normalize_before:
x = self.norm1(x) x = self.norm1(x)
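The two residual variants documented above (concat_after True vs. False) can be sketched with plain numpy, using stand-in att and linear functions that are only placeholders for the real self-attention and projection layers:

import numpy as np

def att(x):
    return 0.5 * x                                   # placeholder for self_attn(x_q, x, x, mask)

def linear(x, w):
    return x @ w                                     # placeholder for concat_linear

x = np.ones((1, 2, 4), dtype=np.float32)             # (batch, time, size)
w = np.full((8, 4), 0.1, dtype=np.float32)           # (2 * size, size)

x_concat = np.concatenate([x, att(x)], axis=-1)      # concat_after=True
out_concat = x + linear(x_concat, w)                 # x + linear(concat(x, att(x)))
out_plain = x + att(x)                               # concat_after=False: x + att(x)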

View File

@ -32,11 +32,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
"""Initialize MultiLayeredConv1d module. """Initialize MultiLayeredConv1d module.
Args: Parameters
in_chans (int): Number of input channels. ----------
hidden_chans (int): Number of hidden channels. in_chans : int
kernel_size (int): Kernel size of conv1d. Number of input channels.
dropout_rate (float): Dropout rate. hidden_chans : int
Number of hidden channels.
kernel_size : int
Kernel size of conv1d.
dropout_rate : float
Dropout rate.
""" """
super(MultiLayeredConv1d, self).__init__() super(MultiLayeredConv1d, self).__init__()
@ -58,14 +63,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
x (paddle.Tensor): Batch of input tensors (B, T, in_chans). ----------
x : paddle.Tensor
Returns: Batch of input tensors (B, T, in_chans).
paddle.Tensor: Batch of output tensors (B, T, in_chans).
Returns
----------
paddle.Tensor
Batch of output tensors (B, T, in_chans).
""" """
# x = paddle.nn.ReLU(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
[0, 2, 1]) [0, 2, 1])
@ -81,12 +88,16 @@ class Conv1dLinear(paddle.nn.Layer):
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
"""Initialize Conv1dLinear module. """Initialize Conv1dLinear module.
Args: Parameters
in_chans (int): Number of input channels. ----------
hidden_chans (int): Number of hidden channels. in_chans : int
kernel_size (int): Kernel size of conv1d. Number of input channels.
dropout_rate (float): Dropout rate. hidden_chans : int
Number of hidden channels.
kernel_size : int
Kernel size of conv1d.
dropout_rate : float
Dropout rate.
""" """
super(Conv1dLinear, self).__init__() super(Conv1dLinear, self).__init__()
self.w_1 = paddle.nn.Conv1D( self.w_1 = paddle.nn.Conv1D(
@ -95,18 +106,22 @@ class Conv1dLinear(paddle.nn.Layer):
kernel_size, kernel_size,
stride=1, stride=1,
padding=(kernel_size - 1) // 2, ) padding=(kernel_size - 1) // 2, )
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans) self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True)
self.dropout = paddle.nn.Dropout(dropout_rate) self.dropout = paddle.nn.Dropout(dropout_rate)
self.relu = paddle.nn.ReLU() self.relu = paddle.nn.ReLU()
def forward(self, x): def forward(self, x):
"""Calculate forward propagation. """Calculate forward propagation.
Args: Parameters
x (paddle.Tensor): Batch of input tensors (B, T, in_chans). ----------
x : paddle.Tensor
Batch of input tensors (B, T, in_chans).
Returns: Returns
paddle.Tensor: Batch of output tensors (B, T, in_chans). ----------
paddle.Tensor
Batch of output tensors (B, T, in_chans).
""" """
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])

View File

@ -19,11 +19,14 @@ import paddle
class PositionwiseFeedForward(paddle.nn.Layer): class PositionwiseFeedForward(paddle.nn.Layer):
"""Positionwise feed forward layer. """Positionwise feed forward layer.
Args: Parameters
idim (int): Input dimension. ----------
hidden_units (int): The number of hidden units. idim : int
dropout_rate (float): Dropout rate. Input dimension.
hidden_units : int
The number of hidden units.
dropout_rate : float
Dropout rate.
""" """
def __init__(self, def __init__(self,
@ -33,8 +36,8 @@ class PositionwiseFeedForward(paddle.nn.Layer):
activation=paddle.nn.ReLU()): activation=paddle.nn.ReLU()):
"""Construct an PositionwiseFeedForward object.""" """Construct an PositionwiseFeedForward object."""
super(PositionwiseFeedForward, self).__init__() super(PositionwiseFeedForward, self).__init__()
self.w_1 = paddle.nn.Linear(idim, hidden_units) self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True)
self.w_2 = paddle.nn.Linear(hidden_units, idim) self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True)
self.dropout = paddle.nn.Dropout(dropout_rate) self.dropout = paddle.nn.Dropout(dropout_rate)
self.activation = activation self.activation = activation
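A hedged numpy sketch of the position-wise pattern the two linear layers above are assumed to implement, w_2(dropout(activation(w_1(x)))), applied independently at every time step (dropout is omitted, i.e. treated as identity):

import numpy as np

rng = np.random.default_rng(0)
idim, hidden_units, T = 4, 8, 3
w1 = rng.standard_normal((idim, hidden_units)).astype(np.float32)
w2 = rng.standard_normal((hidden_units, idim)).astype(np.float32)

x = rng.standard_normal((1, T, idim)).astype(np.float32)
h = np.maximum(x @ w1, 0.0)          # w_1 followed by ReLU
out = h @ w2                         # back to (1, T, idim)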

View File

@ -29,12 +29,16 @@ class MultiSequential(paddle.nn.Sequential):
def repeat(N, fn): def repeat(N, fn):
"""Repeat module N times. """Repeat module N times.
Args: Parameters
N (int): Number of repeat time. ----------
fn (Callable): Function to generate module. N : int
Number of repeat time.
Returns: fn : Callable
MultiSequential: Repeated model instance. Function to generate module.
Returns
----------
MultiSequential
Repeated model instance.
""" """
return MultiSequential(* [fn(n) for n in range(N)]) return MultiSequential(* [fn(n) for n in range(N)])

View File

@ -19,10 +19,12 @@ import paddle
class LayerNorm(paddle.nn.LayerNorm): class LayerNorm(paddle.nn.LayerNorm):
"""Layer normalization module. """Layer normalization module.
Args: Parameters
nout (int): Output dim size. ----------
dim (int): Dimension to be normalized. nout : int
Output dim size.
dim : int
Dimension to be normalized.
""" """
def __init__(self, nout, dim=-1): def __init__(self, nout, dim=-1):
@ -33,12 +35,15 @@ class LayerNorm(paddle.nn.LayerNorm):
def forward(self, x): def forward(self, x):
"""Apply layer normalization. """Apply layer normalization.
Args: Parameters
x (torch.Tensor): Input tensor. ----------
x : paddle.Tensor
Returns: Input tensor.
torch.Tensor: Normalized tensor.
Returns
----------
paddle.Tensor
Normalized tensor.
""" """
if self.dim == -1: if self.dim == -1:
return super(LayerNorm, self).forward(x) return super(LayerNorm, self).forward(x)

View File

@ -28,7 +28,7 @@ def is_broadcastable(shp1, shp2):
def masked_fill(xs: paddle.Tensor, def masked_fill(xs: paddle.Tensor,
mask: paddle.Tensor, mask: paddle.Tensor,
value: Union[float, int]): value: Union[float, int]):
# assert is_broadcastable(xs.shape, mask.shape) is True assert is_broadcastable(xs.shape, mask.shape) is True
bshape = paddle.broadcast_shape(xs.shape, mask.shape) bshape = paddle.broadcast_shape(xs.shape, mask.shape)
mask = mask.broadcast_to(bshape) mask = mask.broadcast_to(bshape)
trues = paddle.ones_like(xs) * value trues = paddle.ones_like(xs) * value
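masked_fill broadcasts the boolean mask to the tensor's shape and writes value wherever the mask is True; in numpy terms (with invented inputs) it behaves like np.where:

import numpy as np

xs = np.array([[1.0, 2.0, 3.0],
               [4.0, 5.0, 6.0]])
mask = np.array([[False, False, True]])              # broadcasts over the batch axis
value = -1e9

bshape = np.broadcast_shapes(xs.shape, mask.shape)   # counterpart of paddle.broadcast_shape
out = np.where(np.broadcast_to(mask, bshape), value, xs)
# out keeps xs where mask is False and holds value where mask is True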

View File

@ -13,20 +13,27 @@
# limitations under the License. # limitations under the License.
import paddle import paddle
from paddle import nn
from typeguard import check_argument_types
# pad with zeros up to the longest sequence in this batch
def pad_list(xs, pad_value): def pad_list(xs, pad_value):
"""Perform padding for the list of tensors. """Perform padding for the list of tensors.
Args: Parameters
xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. ----------
pad_value (float): Value for padding. xs : List[Tensor]
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
pad_value : float)
Value for padding.
Returns: Returns
Tensor: Padded tensor (B, Tmax, `*`). ----------
Tensor
Padded tensor (B, Tmax, `*`).
Examples: Examples
----------
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
>>> x >>> x
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
@ -34,11 +41,9 @@ def pad_list(xs, pad_value):
tensor([[1., 1., 1., 1.], tensor([[1., 1., 1., 1.],
[1., 1., 0., 0.], [1., 1., 0., 0.],
[1., 0., 0., 0.]]) [1., 0., 0., 0.]])
""" """
n_batch = len(xs) n_batch = len(xs)
max_len = max(x.shape[0] for x in xs) max_len = max(x.shape[0] for x in xs)
# pad = xs[0].new(n_batch, max_len, *xs[0].shape[1:]).fill_(pad_value)
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value) pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
for i in range(n_batch): for i in range(n_batch):
@ -50,13 +55,18 @@ def pad_list(xs, pad_value):
def make_pad_mask(lengths, length_dim=-1): def make_pad_mask(lengths, length_dim=-1):
"""Make mask tensor containing indices of padded part. """Make mask tensor containing indices of padded part.
Args: Parameters
lengths (LongTensor or List): Batch of lengths (B,). ----------
lengths : LongTensor or List
Batch of lengths (B,).
Returns: Returns
Tensor: Mask tensor containing indices of padded part bool. ----------
Tensor(bool)
Mask tensor containing indices of the padded part.
Examples: Examples
----------
With only lengths. With only lengths.
>>> lengths = [5, 3, 2] >>> lengths = [5, 3, 2]
@ -64,7 +74,6 @@ def make_pad_mask(lengths, length_dim=-1):
masks = [[0, 0, 0, 0 ,0], masks = [[0, 0, 0, 0 ,0],
[0, 0, 0, 1, 1], [0, 0, 0, 1, 1],
[0, 0, 1, 1, 1]] [0, 0, 1, 1, 1]]
""" """
if length_dim == 0: if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim)) raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@ -88,17 +97,24 @@ def make_pad_mask(lengths, length_dim=-1):
def make_non_pad_mask(lengths, length_dim=-1): def make_non_pad_mask(lengths, length_dim=-1):
"""Make mask tensor containing indices of non-padded part. """Make mask tensor containing indices of non-padded part.
Args: Parameters
lengths (LongTensor or List): Batch of lengths (B,). ----------
xs (Tensor, optional): The reference tensor. lengths : LongTensor or List
Batch of lengths (B,).
xs : Tensor, optional
The reference tensor.
If set, masks will be the same shape as this tensor. If set, masks will be the same shape as this tensor.
length_dim (int, optional): Dimension indicator of the above tensor. length_dim : int, optional
Dimension indicator of the above tensor.
See the example. See the example.
Returns: Returns
ByteTensor: mask tensor containing indices of padded part bool. ----------
Tensor(bool)
Mask tensor containing indices of the non-padded part.
Examples: Examples
----------
With only lengths. With only lengths.
>>> lengths = [5, 3, 2] >>> lengths = [5, 3, 2]
@ -106,6 +122,37 @@ def make_non_pad_mask(lengths, length_dim=-1):
masks = [[1, 1, 1, 1 ,1], masks = [[1, 1, 1, 1 ,1],
[1, 1, 1, 0, 0], [1, 1, 1, 0, 0],
[1, 1, 0, 0, 0]] [1, 1, 0, 0, 0]]
""" """
return paddle.logical_not(make_pad_mask(lengths, length_dim)) return paddle.logical_not(make_pad_mask(lengths, length_dim))
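The masks documented above can be reproduced with a one-line numpy comparison; this sketch uses the lengths from the docstring example:

import numpy as np

lengths = np.array([5, 3, 2])
positions = np.arange(lengths.max())                   # (Tmax,)
pad_mask = positions[None, :] >= lengths[:, None]      # True at padded positions
non_pad_mask = ~pad_mask                               # True at valid positions
# pad_mask.astype(int) ->
# [[0 0 0 0 0]
#  [0 0 0 1 1]
#  [0 0 1 1 1]]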
def initialize(model: nn.Layer, init: str):
"""Initialize weights of a neural network module.
Parameters are initialized using the given method or distribution.
Custom initialization routines can be implemented into submodules
Parameters
----------
model : paddle.nn.Layer
Target.
init : str
Method of initialization.
"""
assert check_argument_types()
if init == "xavier_uniform":
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
nn.initializer.Constant())
elif init == "xavier_normal":
nn.initializer.set_global_initializer(nn.initializer.XavierNormal(),
nn.initializer.Constant())
elif init == "kaiming_uniform":
nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
nn.initializer.Constant())
elif init == "kaiming_normal":
nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(),
nn.initializer.Constant())
else:
raise ValueError("Unknown initialization: " + init)