From 6553d1d72363976424c93b0c7dcef3fc6573d64b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 13 Jul 2021 07:55:56 +0000 Subject: [PATCH] format docstrings --- parakeet/models/fastspeech2_new.py | 230 +++++++++++------- .../duration_predictor.py | 88 ++++--- .../fastspeech2_predictor/length_regulator.py | 27 +- .../modules/fastspeech2_predictor/postnet.py | 37 ++- .../variance_predictor.py | 41 ++-- .../fastspeech2_transformer/attention.py | 113 +++++---- .../fastspeech2_transformer/embedding.py | 58 +++-- .../fastspeech2_transformer/encoder.py | 109 +++++---- .../fastspeech2_transformer/encoder_layer.py | 55 +++-- .../multi_layer_conv.py | 59 +++-- .../positionwise_feed_forward.py | 17 +- .../modules/fastspeech2_transformer/repeat.py | 16 +- parakeet/modules/layer_norm.py | 23 +- parakeet/modules/masked_fill.py | 2 +- parakeet/modules/nets_utils.py | 93 +++++-- 15 files changed, 597 insertions(+), 371 deletions(-) diff --git a/parakeet/models/fastspeech2_new.py b/parakeet/models/fastspeech2_new.py index b530e9c..576bfab 100644 --- a/parakeet/models/fastspeech2_new.py +++ b/parakeet/models/fastspeech2_new.py @@ -12,28 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fastspeech2 related modules for paddle""" -import logging -import numpy as np from typing import Dict from typing import Sequence from typing import Tuple - from typeguard import check_argument_types import paddle +import numpy as np from paddle import nn - from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss -from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator from parakeet.modules.fastspeech2_predictor.postnet import Postnet -from parakeet.modules.nets_utils import make_non_pad_mask -from parakeet.modules.nets_utils import make_pad_mask +from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder +from parakeet.modules.nets_utils import initialize +from parakeet.modules.nets_utils import make_non_pad_mask +from parakeet.modules.nets_utils import make_pad_mask class FastSpeech2(nn.Layer): @@ -155,7 +153,6 @@ class FastSpeech2(nn.Layer): positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) else: - print("encoder_type:", encoder_type) raise ValueError(f"{encoder_type} is not supported.") # define duration predictor @@ -236,6 +233,12 @@ class FastSpeech2(nn.Layer): use_batch_norm=use_batch_norm, dropout_rate=postnet_dropout_rate, )) + # initialize parameters + self._reset_parameters( + init_type=init_type, + init_enc_alpha=init_enc_alpha, + init_dec_alpha=init_dec_alpha, ) + # define criterions self.criterion = FastSpeech2Loss( use_masking=use_masking, use_weighted_masking=use_weighted_masking) @@ -253,25 +256,37 @@ class FastSpeech2(nn.Layer): energy: paddle.Tensor, energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[ str, paddle.Tensor], paddle.Tensor]: - # """Calculate forward propagation. + """Calculate forward propagation. 
- # Args: - # text (LongTensor): Batch of padded token ids (B, Tmax). - # text_lengths (LongTensor): Batch of lengths of each input (B,). - # speech (Tensor): Batch of padded target features (B, Lmax, odim). - # speech_lengths (LongTensor): Batch of the lengths of each target (B,). - # durations (LongTensor): Batch of padded durations (B, Tmax + 1). - # durations_lengths (LongTensor): Batch of duration lengths (B, Tmax + 1). - # pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1). - # pitch_lengths (LongTensor): Batch of pitch lengths (B, Tmax + 1). - # energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1). - # energy_lengths (LongTensor): Batch of energy lengths (B, Tmax + 1). - # Returns: - # Tensor: Loss scalar value. - # Dict: Statistics to be monitored. - # Tensor: Weight value. - - # """ + Parameters + ---------- + text : LongTensor + Batch of padded token ids (B, Tmax). + text_lengths : LongTensor) + Batch of lengths of each input (B,). + speech : Tensor + Batch of padded target features (B, Lmax, odim). + speech_lengths : LongTensor + Batch of the lengths of each target (B,). + durations : LongTensor + Batch of padded durations (B, Tmax + 1). + durations_lengths : LongTensor + Batch of duration lengths (B, Tmax + 1). + pitch : Tensor + Batch of padded token-averaged pitch (B, Tmax + 1, 1). + pitch_lengths : LongTensor + Batch of pitch lengths (B, Tmax + 1). + energy : Tensor + Batch of padded token-averaged energy (B, Tmax + 1, 1). + energy_lengths : LongTensor + Batch of energy lengths (B, Tmax + 1). + Returns + ---------- + Tensor + Loss scalar value. + Dict + Statistics to be monitored. + """ text = text[:, :text_lengths.max()] # for data-parallel speech = speech[:, :speech_lengths.max()] # for data-parallel durations = durations[:, :durations_lengths.max()] # for data-parallel @@ -282,16 +297,11 @@ class FastSpeech2(nn.Layer): # Add eos at the last of sequence # xs = F.pad(text, [0, 1], "constant", self.padding_idx) - print("xs.shape in fastspeech2.py before:", text.shape, text) xs = np.pad(text.numpy(), pad_width=((0, 0), (0, 1)), mode="constant", constant_values=self.padding_idx) xs = paddle.to_tensor(xs) - print("xs.shape in fastspeech2.py end:", xs.shape, xs) - # my_pad = nn.Pad1D(padding=[0, 1], mode="constant", value=self.padding_idx) - # xs = my_pad(text) - # 是否会数组越界? 
xs 是否能取到 l -> 可以,因为上一步补充了一个 padding_idx,又变成了 eos for i, l in enumerate(text_lengths): xs[i, l] = self.eos ilens = text_lengths + 1 @@ -302,23 +312,16 @@ class FastSpeech2(nn.Layer): # forward propagation before_outs, after_outs, d_outs, p_outs, e_outs = self._forward( xs, ilens, ys, olens, ds, ps, es, is_inference=False) - print("d_outs in paddle:", d_outs) - print("p_outs in paddle:", p_outs) - print("e_outs in paddle:", e_outs) - # modify mod part of groundtruth if self.reduction_factor > 1: - # 需要改 olens = paddle.to_tensor([ olen - olen % self.reduction_factor for olen in olens.numpy() ]) max_olen = max(olens) ys = ys[:, :max_olen] - # calculate loss if self.postnet is None: after_outs = None - # calculate loss l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -363,9 +366,8 @@ class FastSpeech2(nn.Layer): alpha: float=1.0, ) -> Sequence[paddle.Tensor]: # forward encoder x_masks = self._source_mask(ilens) - print("xs.shape in fastspeech2.py:", xs.shape) - hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim) + hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim) # forward duration predictor and variance predictors d_masks = make_pad_mask(ilens) @@ -377,10 +379,11 @@ class FastSpeech2(nn.Layer): e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1)) else: e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1)) - print("p_outs.shape:", p_outs.shape) + if is_inference: d_outs = self.duration_predictor.inference(hs, d_masks) # (B, Tmax) + # print("d_outs:",d_outs) # use prediction in inference # (B, Tmax, 1) @@ -404,7 +407,6 @@ class FastSpeech2(nn.Layer): # forward decoder if olens is not None and not is_inference: if self.reduction_factor > 1: - # 直接to_paddle ,维度会增加 1,需要先转成 numpy olens_in = paddle.to_tensor( [olen // self.reduction_factor for olen in olens.numpy()]) else: @@ -412,9 +414,10 @@ class FastSpeech2(nn.Layer): h_masks = self._source_mask(olens_in) else: h_masks = None - zs, _ = self.decoder(hs, h_masks) # (B, Lmax, adim) - before_outs = self.feat_out(zs).reshape( - (zs.shape[0], -1, self.odim)) # (B, Lmax, odim) + # (B, Lmax, adim) + zs, _ = self.decoder(hs, h_masks) + # (B, Lmax, odim) + before_outs = self.feat_out(zs).reshape((zs.shape[0], -1, self.odim)) # postnet -> (B, Lmax//r * r, odim) if self.postnet is None: @@ -437,20 +440,30 @@ class FastSpeech2(nn.Layer): paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Args: - text (LongTensor): Input sequence of characters (T,). - speech (Tensor, optional): Feature sequence to extract style (N, idim). - durations (LongTensor, optional): Groundtruth of duration (T + 1,). - pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1). - energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1). - alpha (float, optional): Alpha to control the speed. - use_teacher_forcing (bool, optional): Whether to use teacher forcing. - If true, groundtruth of duration, pitch and energy will be used. + Parameters + ---------- + text : LongTensor + Input sequence of characters (T,). + speech : Tensor, optional + Feature sequence to extract style (N, idim). + durations : LongTensor, optional + Groundtruth of duration (T + 1,). + pitch : Tensor, optional + Groundtruth of token-averaged pitch (T + 1, 1). + energy : Tensor, optional + Groundtruth of token-averaged energy (T + 1, 1). + alpha : float, optional + Alpha to control the speed. 
+ use_teacher_forcing : bool, optional + Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. - Returns: - Tensor: Output sequence of features (L, odim). - None: Dummy for compatibility. - None: Dummy for compatibility. + Returns + ---------- + Tensor + Output sequence of features (L, odim). + None + Dummy for compatibility. """ x, y = text, speech @@ -460,13 +473,15 @@ class FastSpeech2(nn.Layer): x = np.pad(text.numpy(), pad_width=((0, 1)), mode="constant", - constant_values=self.padding_idx) + constant_values=self.eos) + x = paddle.to_tensor(x) # setup batch axis ilens = paddle.to_tensor( [x.shape[0]], dtype=paddle.int64, place=x.place) xs, ys = x.unsqueeze(0), None + if y is not None: ys = y.unsqueeze(0) @@ -493,14 +508,19 @@ class FastSpeech2(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Args: - ilens (LongTensor): Batch of lengths (B,). + Parameters + ---------- + ilens : LongTensor + Batch of lengths (B,). - Returns: - Tensor: Mask tensor for self-attention. + Returns + ------- + Tensor + Mask tensor for self-attention. dtype=paddle.bool - Examples: + Examples + ------- >>> ilens = [5, 3] >>> self._source_mask(ilens) tensor([[[1, 1, 1, 1, 1], @@ -510,6 +530,29 @@ class FastSpeech2(nn.Layer): x_masks = make_non_pad_mask(ilens) return x_masks.unsqueeze(-2) + def _reset_parameters(self, + init_type: str, + init_enc_alpha: float, + init_dec_alpha: float): + # initialize parameters + initialize(self, init_type) + + # initialize alpha in scaled positional encoding + if self.encoder_type == "transformer" and self.use_scaled_pos_enc: + init_enc_alpha = paddle.to_tensor(init_enc_alpha) + self.encoder.embed[-1].alpha = paddle.create_parameter( + shape=init_enc_alpha.shape, + dtype=str(init_enc_alpha.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign( + init_enc_alpha)) + if self.decoder_type == "transformer" and self.use_scaled_pos_enc: + init_dec_alpha = paddle.to_tensor(init_dec_alpha) + self.decoder.embed[-1].alpha = paddle.create_parameter( + shape=init_dec_alpha.shape, + dtype=str(init_dec_alpha.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign( + init_dec_alpha)) + class FastSpeech2Loss(nn.Layer): """Loss function module for FastSpeech2.""" @@ -519,12 +562,12 @@ class FastSpeech2Loss(nn.Layer): use_weighted_masking: bool=False): """Initialize feed-forward Transformer loss module. - Args: - use_masking (bool): + Parameters + ---------- + use_masking : bool Whether to apply masking for padded part in loss calculation. - use_weighted_masking (bool): + use_weighted_masking : bool Whether to weighted masking in loss calculation. - """ assert check_argument_types() super().__init__() @@ -555,24 +598,41 @@ class FastSpeech2Loss(nn.Layer): paddle.Tensor, paddle.Tensor]: """Calculate forward propagation. - Args: - after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim). - before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim). - d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax). - p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1). - ys (Tensor): Batch of target features (B, Lmax, odim). - ds (LongTensor): Batch of durations (B, Tmax). - ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1). - es (Tensor): Batch of target token-averaged energy (B, Tmax, 1). 
- ilens (LongTensor): Batch of the lengths of each input (B,). - olens (LongTensor): Batch of the lengths of each target (B,). + Parameters + ---------- + after_outs : Tensor + Batch of outputs after postnets (B, Lmax, odim). + before_outs : Tensor + Batch of outputs before postnets (B, Lmax, odim). + d_outs : LongTensor + Batch of outputs of duration predictor (B, Tmax). + p_outs : Tensor + Batch of outputs of pitch predictor (B, Tmax, 1). + e_outs : Tensor + Batch of outputs of energy predictor (B, Tmax, 1). + ys : Tensor + Batch of target features (B, Lmax, odim). + ds : LongTensor + Batch of durations (B, Tmax). + ps : Tensor + Batch of target token-averaged pitch (B, Tmax, 1). + es : Tensor + Batch of target token-averaged energy (B, Tmax, 1). + ilens : LongTensor + Batch of the lengths of each input (B,). + olens : LongTensor + Batch of the lengths of each target (B,). - Returns: - Tensor: L1 loss value. - Tensor: Duration predictor loss value. - Tensor: Pitch predictor loss value. - Tensor: Energy predictor loss value. + Returns + ---------- + Tensor + L1 loss value. + Tensor + Duration predictor loss value. + Tensor + Pitch predictor loss value. + Tensor + Energy predictor loss value. """ # apply mask to remove padded part diff --git a/parakeet/modules/fastspeech2_predictor/duration_predictor.py b/parakeet/modules/fastspeech2_predictor/duration_predictor.py index 8bd74b4..288df2f 100644 --- a/parakeet/modules/fastspeech2_predictor/duration_predictor.py +++ b/parakeet/modules/fastspeech2_predictor/duration_predictor.py @@ -15,7 +15,6 @@ import paddle from paddle import nn - from parakeet.modules.layer_norm import LayerNorm from parakeet.modules.masked_fill import masked_fill @@ -31,7 +30,8 @@ class DurationPredictor(nn.Layer): .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: https://arxiv.org/pdf/1905.09263.pdf - Note: + Note + ---------- The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`, the outputs are calculated in log domain but in `inference`, @@ -48,13 +48,20 @@ class DurationPredictor(nn.Layer): offset=1.0): """Initilize duration predictor module. - Args: - idim (int): Input dimension. - n_layers (int, optional): Number of convolutional layers. - n_chans (int, optional): Number of channels of convolutional layers. - kernel_size (int, optional): Kernel size of convolutional layers. - dropout_rate (float, optional): Dropout rate. - offset (float, optional): Offset value to avoid nan in log domain. + Parameters + ---------- + idim : int + Input dimension. + n_layers : int, optional + Number of convolutional layers. + n_chans : int, optional + Number of channels of convolutional layers. + kernel_size : int, optional + Kernel size of convolutional layers. + dropout_rate : float, optional + Dropout rate. + offset : float, optional + Offset value to avoid nan in log domain. 
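+
+        Examples
+        ----------
+        A minimal construction and inference sketch; the batch and feature
+        sizes below are illustrative only.
+
+        >>> predictor = DurationPredictor(idim=384)
+        >>> xs = paddle.randn([2, 10, 384])
+        >>> log_durations = predictor(xs)
+        >>> durations = predictor.inference(xs)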
""" super(DurationPredictor, self).__init__() @@ -74,7 +81,7 @@ class DurationPredictor(nn.Layer): LayerNorm( n_chans, dim=1), nn.Dropout(dropout_rate), )) - self.linear = nn.Linear(n_chans, 1) + self.linear = nn.Linear(n_chans, 1, bias_attr=True) def _forward(self, xs, x_masks=None, is_inference=False): # (B, idim, Tmax) @@ -83,7 +90,7 @@ class DurationPredictor(nn.Layer): for f in self.conv: xs = f(xs) - # NOTE: calculate in log domain + # NOTE: calculate in log domain # (B, Tmax) xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1) @@ -99,28 +106,34 @@ class DurationPredictor(nn.Layer): def forward(self, xs, x_masks=None): """Calculate forward propagation. - Args: - xs (Tensor): Batch of input sequences (B, Tmax, idim). - x_masks (ByteTensor, optional): + Parameters + ---------- + xs : Tensor + Batch of input sequences (B, Tmax, idim). + x_masks : ByteTensor, optional Batch of masks indicating padded part (B, Tmax). - Returns: - Tensor: Batch of predicted durations in log domain (B, Tmax). - + Returns + ---------- + Tensor + Batch of predicted durations in log domain (B, Tmax). """ return self._forward(xs, x_masks, False) def inference(self, xs, x_masks=None): """Inference duration. - Args: - xs (Tensor): Batch of input sequences (B, Tmax, idim). - x_masks (ByteTensor, optional): + Parameters + ---------- + xs : Tensor + Batch of input sequences (B, Tmax, idim). + x_masks : Tensor(bool), optional Batch of masks indicating padded part (B, Tmax). - Returns: - LongTensor: Batch of predicted durations in linear domain int64 (B, Tmax). - + Returns + ---------- + LongTensor + Batch of predicted durations in linear domain int64 (B, Tmax). """ return self._forward(xs, x_masks, True) @@ -135,10 +148,12 @@ class DurationPredictorLoss(nn.Layer): def __init__(self, offset=1.0, reduction="mean"): """Initilize duration predictor loss module. - Args: - offset (float, optional): Offset value to avoid nan in log domain. - reduction (str): Reduction type in loss calculation. - + Parameters + ---------- + offset : float, optional + Offset value to avoid nan in log domain. + reduction : str + Reduction type in loss calculation. """ super(DurationPredictorLoss, self).__init__() self.criterion = nn.MSELoss(reduction=reduction) @@ -147,16 +162,21 @@ class DurationPredictorLoss(nn.Layer): def forward(self, outputs, targets): """Calculate forward propagation. - Args: - outputs (Tensor): Batch of prediction durations in log domain (B, T) - targets (LongTensor): Batch of groundtruth durations in linear domain (B, T) + Parameters + ---------- + outputs : Tensor + Batch of prediction durations in log domain (B, T) + targets : LongTensor + Batch of groundtruth durations in linear domain (B, T) - Returns: - Tensor: Mean squared error loss value. + Returns + ---------- + Tensor + Mean squared error loss value. - Note: + Note + ---------- `outputs` is in log domain but `targets` is in linear domain. - """ # NOTE: outputs is in log domain while targets in linear targets = paddle.log(targets.cast(dtype='float32') + self.offset) diff --git a/parakeet/modules/fastspeech2_predictor/length_regulator.py b/parakeet/modules/fastspeech2_predictor/length_regulator.py index e32071c..62b29a3 100644 --- a/parakeet/modules/fastspeech2_predictor/length_regulator.py +++ b/parakeet/modules/fastspeech2_predictor/length_regulator.py @@ -13,8 +13,6 @@ # limitations under the License. 
"""Length regulator related modules.""" -import logging - import numpy as np import paddle from paddle import nn @@ -37,8 +35,10 @@ class LengthRegulator(nn.Layer): def __init__(self, pad_value=0.0): """Initilize length regulator module. - Args: - pad_value (float, optional): Value used for padding. + Parameters + ---------- + pad_value : float, optional + Value used for padding. """ super().__init__() @@ -68,14 +68,19 @@ class LengthRegulator(nn.Layer): def forward(self, xs, ds, alpha=1.0): """Calculate forward propagation. - Args: - xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds (LongTensor): Batch of durations of each frame (B, T). - alpha (float, optional): Alpha value to control speed of speech. - - Returns: - Tensor: replicated input tensor based on durations (B, T*, D). + Parameters + ---------- + xs : Tensor + Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds : LongTensor + Batch of durations of each frame (B, T). + alpha : float, optional + Alpha value to control speed of speech. + Returns + ---------- + Tensor + replicated input tensor based on durations (B, T*, D). """ if alpha != 1.0: assert alpha > 0 diff --git a/parakeet/modules/fastspeech2_predictor/postnet.py b/parakeet/modules/fastspeech2_predictor/postnet.py index 403e48c..fe9fd21 100644 --- a/parakeet/modules/fastspeech2_predictor/postnet.py +++ b/parakeet/modules/fastspeech2_predictor/postnet.py @@ -43,15 +43,22 @@ class Postnet(nn.Layer): use_batch_norm=True, ): """Initialize postnet module. - Args: - idim (int): Dimension of the inputs. - odim (int): Dimension of the outputs. - n_layers (int, optional): The number of layers. - n_filts (int, optional): The number of filter size. - n_units (int, optional): The number of filter channels. - use_batch_norm (bool, optional): Whether to use batch normalization.. - dropout_rate (float, optional): Dropout rate.. - + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + n_layers : int, optional + The number of layers. + n_filts : int, optional + The number of filter size. + n_units : int, optional + The number of filter channels. + use_batch_norm : bool, optional + Whether to use batch normalization.. + dropout_rate : float, optional + Dropout rate.. """ super(Postnet, self).__init__() self.postnet = nn.LayerList() @@ -111,11 +118,15 @@ class Postnet(nn.Layer): def forward(self, xs): """Calculate forward propagation. - Args: - xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax). + Parameters + ---------- + xs : Tensor + Batch of the sequences of padded input tensors (B, idim, Tmax). - Returns: - Tensor: Batch of padded output tensor. (B, odim, Tmax). + Returns + ---------- + Tensor + Batch of padded output tensor. (B, odim, Tmax). """ for i in six.moves.range(len(self.postnet)): diff --git a/parakeet/modules/fastspeech2_predictor/variance_predictor.py b/parakeet/modules/fastspeech2_predictor/variance_predictor.py index 751f3a6..5cbc091 100644 --- a/parakeet/modules/fastspeech2_predictor/variance_predictor.py +++ b/parakeet/modules/fastspeech2_predictor/variance_predictor.py @@ -15,10 +15,8 @@ import paddle from paddle import nn - from parakeet.modules.layer_norm import LayerNorm from parakeet.modules.masked_fill import masked_fill - from typeguard import check_argument_types @@ -43,13 +41,18 @@ class VariancePredictor(nn.Layer): dropout_rate: float=0.5, ): """Initilize duration predictor module. - Args: - idim (int): Input dimension. 
- n_layers (int, optional): Number of convolutional layers. - n_chans (int, optional): Number of channels of convolutional layers. - kernel_size (int, optional): Kernel size of convolutional layers. - dropout_rate (float, optional): Dropout rate. - + Parameters + ---------- + idim : int + Input dimension. + n_layers : int, optional + Number of convolutional layers. + n_chans : int, optional + Number of channels of convolutional layers. + kernel_size : int, optional + Kernel size of convolutional layers. + dropout_rate : float, optional + Dropout rate. """ assert check_argument_types() super().__init__() @@ -70,26 +73,30 @@ class VariancePredictor(nn.Layer): n_chans, dim=1), nn.Dropout(dropout_rate), )) - self.linear = nn.Linear(n_chans, 1) + self.linear = nn.Linear(n_chans, 1, bias_attr=True) def forward(self, xs: paddle.Tensor, x_masks: paddle.Tensor=None) -> paddle.Tensor: """Calculate forward propagation. - Args: - xs (Tensor): Batch of input sequences (B, Tmax, idim). - x_masks (ByteTensor, optional): + Parameters + ---------- + xs : Tensor + Batch of input sequences (B, Tmax, idim). + x_masks : Tensor(bool), optional Batch of masks indicating padded part (B, Tmax, 1). - Returns: - Tensor: Batch of predicted sequences (B, Tmax, 1). - + Returns + ---------- + Tensor + Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) # (B, C, Tmax) for f in self.conv: - xs = f(xs) # (B, C, Tmax) + # (B, C, Tmax) + xs = f(xs) # (B, Tmax, 1) xs = self.linear(xs.transpose([0, 2, 1])) diff --git a/parakeet/modules/fastspeech2_transformer/attention.py b/parakeet/modules/fastspeech2_transformer/attention.py index 4e819c4..3c04c6c 100644 --- a/parakeet/modules/fastspeech2_transformer/attention.py +++ b/parakeet/modules/fastspeech2_transformer/attention.py @@ -16,23 +16,22 @@ import math import numpy - import paddle from paddle import nn - -from paddle.fluid.layers import sequence_mask - from parakeet.modules.masked_fill import masked_fill class MultiHeadedAttention(nn.Layer): """Multi-Head Attention layer. - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - + Parameters + ---------- + n_head : int + The number of heads. + n_feat : int + The number of features. + dropout_rate : float + Dropout rate. """ def __init__(self, n_head, n_feat, dropout_rate): @@ -42,33 +41,42 @@ class MultiHeadedAttention(nn.Layer): # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True) + self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True) + self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True) + self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True) self.attn = None self.dropout = nn.Dropout(p=dropout_rate) def forward_qkv(self, query, key, value): """Transform query, key and value. - Args: - query (paddle.Tensor): Query tensor (#batch, time1, size). - key (paddle.Tensor): Key tensor (#batch, time2, size). - value (paddle.Tensor): Value tensor (#batch, time2, size). - - Returns: - paddle.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). - paddle.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). - paddle.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). 
+ Parameters + ---------- + query : paddle.Tensor + query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). + Returns + ---------- + paddle.Tensor + Transformed query tensor (#batch, n_head, time1, d_k). + paddle.Tensor + Transformed key tensor (#batch, n_head, time2, d_k). + paddle.Tensor + Transformed value tensor (#batch, n_head, time2, d_k). """ n_batch = query.shape[0] + q = paddle.reshape( self.linear_q(query), [n_batch, -1, self.h, self.d_k]) k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k]) v = paddle.reshape( self.linear_v(value), [n_batch, -1, self.h, self.d_k]) + # (batch, head, time1, d_k) q = q.transpose((0, 2, 1, 3)) # (batch, head, time2, d_k) @@ -80,44 +88,40 @@ class MultiHeadedAttention(nn.Layer): def forward_attention(self, value, scores, mask=None): """Compute attention context vector. - Args: - value (paddle.Tensor): Transformed value (#batch, n_head, time2, d_k). - scores (paddle.Tensor): Attention score (#batch, n_head, time1, time2). - mask (paddle.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + Parameters + ---------- + value : paddle.Tensor + Transformed value (#batch, n_head, time2, d_k). + scores : paddle.Tensor + Attention score (#batch, n_head, time1, time2). + mask : paddle.Tensor + Mask (#batch, 1, time2) or (#batch, time1, time2). - Returns: - paddle.Tensor: Transformed value (#batch, time1, d_model) + Returns + ---------- + paddle.Tensor: + Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). - """ n_batch = value.shape[0] softmax = paddle.nn.Softmax(axis=-1) if mask is not None: mask = mask.unsqueeze(1) - # mask 取反, pad 的位置变成 true,之后 pad 的位置被替换为 0 mask = paddle.logical_not(mask) - - # mask = paddle.cast(mask, dtype='int64') - # mask ==1 的位置用 min_value 代替 - # scores = scores.masked_fill(mask, min_value) min_value = float( numpy.finfo( paddle.to_tensor( 0, dtype=scores.dtype).numpy().dtype).min) scores = masked_fill(scores, mask, min_value) - self.attn = softmax(scores) # (batch, head, time1, time2) - - # 用value填充tensor中与mask中值为1位置相对应的元素 == 保留 mask 为0 的值 - # self.attn = torch.softmax(scores, dim=-1).masked_fill( - # mask, 0.0 - # ) # (batch, head, time1, time2) - # 保留 mask 为 0 的位置,其他变成 0 + # (batch, head, time1, time2) + self.attn = softmax(scores) self.attn = masked_fill(self.attn, mask, 0.0) else: - self.attn = softmax(scores) # (batch, head, time1, time2) - # (batch, head, time1, time2) + # (batch, head, time1, time2) + self.attn = softmax(scores) + # (batch, head, time1, time2) p_attn = self.dropout(self.attn) # (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k) x = paddle.matmul(p_attn, value) @@ -130,16 +134,21 @@ class MultiHeadedAttention(nn.Layer): def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. - Args: - query (paddle.Tensor): Query tensor (#batch, time1, size). - key (paddle.Tensor): Key tensor (#batch, time2, size). - value (paddle.Tensor): Value tensor (#batch, time2, size). - mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - paddle.Tensor: Output tensor (#batch, time1, d_model). + Parameters + ---------- + query : paddle.Tensor + Query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). 
+ mask : paddle.Tensor + Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time1, d_model). """ q, k, v = self.forward_qkv(query, key, value) scores = paddle.matmul(q, k.transpose( diff --git a/parakeet/modules/fastspeech2_transformer/embedding.py b/parakeet/modules/fastspeech2_transformer/embedding.py index 8019cd3..51a4c1b 100644 --- a/parakeet/modules/fastspeech2_transformer/embedding.py +++ b/parakeet/modules/fastspeech2_transformer/embedding.py @@ -22,14 +22,16 @@ from paddle import nn class PositionalEncoding(nn.Layer): """Positional encoding. - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - reverse (bool): Whether to reverse the input position. Only for - the class LegacyRelPositionalEncoding. We remove it in the current - class RelPositionalEncoding. - + Parameters + ---------- + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + reverse : bool + Whether to reverse the input position. Only for """ def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): @@ -47,7 +49,6 @@ class PositionalEncoding(nn.Layer): pe = paddle.zeros([x.shape[1], self.d_model]) if self.reverse: - # (x.shape[1],1) position = paddle.arange( x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1) else: @@ -65,12 +66,15 @@ class PositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. - Args: - x (paddle.Tensor): Input tensor (batch, time, `*`). - - Returns: - paddle.Tensor: Encoded tensor (batch, time, `*`). + Parameters + ---------- + x : paddle.Tensor + Input tensor (batch, time, `*`). + Returns + ---------- + paddle.Tensor + Encoded tensor (batch, time, `*`). """ self.extend_pe(x) x = x * self.xscale + self.pe[:, :x.shape[1]] @@ -82,11 +86,14 @@ class ScaledPositionalEncoding(PositionalEncoding): See Sec. 3.2 https://arxiv.org/abs/1809.08895 - Args: - d_model (int): Embedding dimension. - dropout_rate (float): Dropout rate. - max_len (int): Maximum input length. - + Parameters + ---------- + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. """ def __init__(self, d_model, dropout_rate, max_len=5000): @@ -106,12 +113,15 @@ class ScaledPositionalEncoding(PositionalEncoding): def forward(self, x): """Add positional encoding. - Args: - x (paddle.Tensor): Input tensor (batch, time, `*`). - - Returns: - paddle.Tensor: Encoded tensor (batch, time, `*`). + Parameters + ---------- + x : paddle.Tensor + Input tensor (batch, time, `*`). + Returns + ---------- + paddle.Tensor + Encoded tensor (batch, time, `*`). """ self.extend_pe(x) x = x + self.alpha * self.pe[:, :x.shape[1]] diff --git a/parakeet/modules/fastspeech2_transformer/encoder.py b/parakeet/modules/fastspeech2_transformer/encoder.py index f07083e..0288ab4 100644 --- a/parakeet/modules/fastspeech2_transformer/encoder.py +++ b/parakeet/modules/fastspeech2_transformer/encoder.py @@ -12,19 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import math - -import numpy import logging -import paddle + from paddle import nn -from paddle.nn import functional as F -from paddle.nn import initializer as I -from paddle.fluid.layers import sequence_mask -import sys from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward @@ -35,28 +27,44 @@ from parakeet.modules.fastspeech2_transformer.repeat import repeat class Encoder(nn.Layer): """Transformer encoder module. - Args: - idim (int): Input dimension. - attention_dim (int): Dimention of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, paddle.nn.Layer]): Input layer type. - pos_enc_class (paddle.nn.Layer): Positional encoding module class. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + pos_enc_class : paddle.nn.Layer + Positional encoding module class. `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - selfattention_layer_type (str): Encoder attention layer type. - padding_idx (int): Padding idx for input_layer=embed. - + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + selfattention_layer_type : str + Encoder attention layer type. + padding_idx : int + Padding idx for input_layer=embed. 
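+
+    Examples
+    ----------
+    A usage sketch; the sizes are illustrative and the remaining constructor
+    arguments are assumed to keep their default values.
+
+    >>> encoder = Encoder(idim=80, input_layer="linear")
+    >>> xs = paddle.randn([2, 100, 80])
+    >>> hs, hs_masks = encoder(xs, None)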
""" def __init__( @@ -82,7 +90,8 @@ class Encoder(nn.Layer): self.conv_subsampling_factor = 1 if input_layer == "linear": self.embed = nn.Sequential( - nn.Linear(idim, attention_dim), + nn.Linear( + idim, attention_dim, bias_attr=True), nn.LayerNorm(attention_dim), nn.Dropout(dropout_rate), nn.ReLU(), @@ -169,14 +178,19 @@ class Encoder(nn.Layer): def forward(self, xs, masks): """Encode input sequence. - Args: - xs (paddle.Tensor): Input tensor (#batch, time, idim). - masks (paddle.Tensor): Mask tensor (#batch, time). - - Returns: - paddle.Tensor: Output tensor (#batch, time, attention_dim). - paddle.Tensor: Mask tensor (#batch, time). + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, time). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -187,16 +201,23 @@ class Encoder(nn.Layer): def forward_one_step(self, xs, masks, cache=None): """Encode input frame. - Args: - xs (paddle.Tensor): Input tensor. - masks (paddle.Tensor): Mask tensor. - cache (List[paddle.Tensor]): List of cache tensors. - - Returns: - paddle.Tensor: Output tensor. - paddle.Tensor: Mask tensor. - List[paddle.Tensor]: List of new cache tensors. + Parameters + ---------- + xs : paddle.Tensor + Input tensor. + masks : paddle.Tensor + Mask tensor. + cache : List[paddle.Tensor] + List of cache tensors. + Returns + ---------- + paddle.Tensor + Output tensor. + paddle.Tensor + Mask tensor. + List[paddle.Tensor] + List of new cache tensors. """ xs = self.embed(xs) diff --git a/parakeet/modules/fastspeech2_transformer/encoder_layer.py b/parakeet/modules/fastspeech2_transformer/encoder_layer.py index d8c1f5e..e416348 100644 --- a/parakeet/modules/fastspeech2_transformer/encoder_layer.py +++ b/parakeet/modules/fastspeech2_transformer/encoder_layer.py @@ -14,28 +14,31 @@ """Encoder self-attention layer definition.""" import paddle - from paddle import nn class EncoderLayer(nn.Layer): """Encoder layer module. - Args: - size (int): Input dimension. - self_attn (paddle.nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward (paddle.nn.Layer): Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + Parameters + ---------- + size : int + Input dimension. + self_attn : paddle.nn.Layer + Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward : paddle.nn.Layer + Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate : float + Dropout rate. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. 
x -> x + att(x) - """ def __init__( @@ -57,20 +60,26 @@ class EncoderLayer(nn.Layer): self.normalize_before = normalize_before self.concat_after = concat_after if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) + self.concat_linear = nn.Linear(size + size, size, bias_attr=True) def forward(self, x, mask, cache=None): """Compute encoded features. - Args: - x_input (paddle.Tensor): Input tensor (#batch, time, size). - mask (paddle.Tensor): Mask tensor for the input (#batch, time). - cache (paddle.Tensor): Cache tensor of the input (#batch, time - 1, size). - - Returns: - paddle.Tensor: Output tensor (#batch, time, size). - paddle.Tensor: Mask tensor (#batch, time). + Parameters + ---------- + x_input : paddle.Tensor + Input tensor (#batch, time, size). + mask : paddle.Tensor + Mask tensor for the input (#batch, time). + cache : paddle.Tensor + Cache tensor of the input (#batch, time - 1, size). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, size). + paddle.Tensor + Mask tensor (#batch, time). """ residual = x if self.normalize_before: @@ -82,7 +91,6 @@ class EncoderLayer(nn.Layer): assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) x_q = x[:, -1:, :] residual = residual[:, -1:, :] - # non-pad mask 变成 pad mask mask = None if mask is None else mask[:, -1:, :] if self.concat_after: @@ -90,6 +98,7 @@ class EncoderLayer(nn.Layer): (x, self.self_attn(x_q, x, x, mask)), axis=-1) x = residual + self.concat_linear(x_concat) else: + x = residual + self.dropout(self.self_attn(x_q, x, x, mask)) if not self.normalize_before: x = self.norm1(x) diff --git a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py index 38a2535..cde2168 100644 --- a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py +++ b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py @@ -32,11 +32,16 @@ class MultiLayeredConv1d(paddle.nn.Layer): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): """Initialize MultiLayeredConv1d module. - Args: - in_chans (int): Number of input channels. - hidden_chans (int): Number of hidden channels. - kernel_size (int): Kernel size of conv1d. - dropout_rate (float): Dropout rate. + Parameters + ---------- + in_chans : int + Number of input channels. + hidden_chans : int + Number of hidden channels. + kernel_size : int + Kernel size of conv1d. + dropout_rate : float + Dropout rate. """ super(MultiLayeredConv1d, self).__init__() @@ -58,14 +63,16 @@ class MultiLayeredConv1d(paddle.nn.Layer): def forward(self, x): """Calculate forward propagation. - Args: - x (paddle.Tensor): Batch of input tensors (B, T, in_chans). - - Returns: - paddle.Tensor: Batch of output tensors (B, T, in_chans). + Parameters + ---------- + x : paddle.Tensor + Batch of input tensors (B, T, in_chans). + Returns + ---------- + paddle.Tensor + Batch of output tensors (B, T, in_chans). """ - # x = paddle.nn.ReLU(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( [0, 2, 1]) @@ -81,12 +88,16 @@ class Conv1dLinear(paddle.nn.Layer): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): """Initialize Conv1dLinear module. - Args: - in_chans (int): Number of input channels. - hidden_chans (int): Number of hidden channels. - kernel_size (int): Kernel size of conv1d. - dropout_rate (float): Dropout rate. 
- + Parameters + ---------- + in_chans : int + Number of input channels. + hidden_chans : int + Number of hidden channels. + kernel_size : int + Kernel size of conv1d. + dropout_rate : float + Dropout rate. """ super(Conv1dLinear, self).__init__() self.w_1 = paddle.nn.Conv1D( @@ -95,18 +106,22 @@ class Conv1dLinear(paddle.nn.Layer): kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Linear(hidden_chans, in_chans) + self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True) self.dropout = paddle.nn.Dropout(dropout_rate) self.relu = paddle.nn.ReLU() def forward(self, x): """Calculate forward propagation. - Args: - x (paddle.Tensor): Batch of input tensors (B, T, in_chans). + Parameters + ---------- + x : paddle.Tensor + Batch of input tensors (B, T, in_chans). - Returns: - paddle.Tensor: Batch of output tensors (B, T, in_chans). + Returns + ---------- + paddle.Tensor + Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) diff --git a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py index 5283185..930d496 100644 --- a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py +++ b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py @@ -19,11 +19,14 @@ import paddle class PositionwiseFeedForward(paddle.nn.Layer): """Positionwise feed forward layer. - Args: - idim (int): Input dimenstion. - hidden_units (int): The number of hidden units. - dropout_rate (float): Dropout rate. - + Parameters + ---------- + idim : int + Input dimenstion. + hidden_units : int + The number of hidden units. + dropout_rate : float + Dropout rate. """ def __init__(self, @@ -33,8 +36,8 @@ class PositionwiseFeedForward(paddle.nn.Layer): activation=paddle.nn.ReLU()): """Construct an PositionwiseFeedForward object.""" super(PositionwiseFeedForward, self).__init__() - self.w_1 = paddle.nn.Linear(idim, hidden_units) - self.w_2 = paddle.nn.Linear(hidden_units, idim) + self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True) + self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True) self.dropout = paddle.nn.Dropout(dropout_rate) self.activation = activation diff --git a/parakeet/modules/fastspeech2_transformer/repeat.py b/parakeet/modules/fastspeech2_transformer/repeat.py index 646e164..62b21de 100644 --- a/parakeet/modules/fastspeech2_transformer/repeat.py +++ b/parakeet/modules/fastspeech2_transformer/repeat.py @@ -29,12 +29,16 @@ class MultiSequential(paddle.nn.Sequential): def repeat(N, fn): """Repeat module N times. - Args: - N (int): Number of repeat time. - fn (Callable): Function to generate module. - - Returns: - MultiSequential: Repeated model instance. + Parameters + ---------- + N : int + Number of repeat time. + fn : Callable + Function to generate module. + Returns + ---------- + MultiSequential + Repeated model instance. """ return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/parakeet/modules/layer_norm.py b/parakeet/modules/layer_norm.py index 3f44652..5a9fe4e 100644 --- a/parakeet/modules/layer_norm.py +++ b/parakeet/modules/layer_norm.py @@ -19,10 +19,12 @@ import paddle class LayerNorm(paddle.nn.LayerNorm): """Layer normalization module. - Args: - nout (int): Output dim size. - dim (int): Dimension to be normalized. - + Parameters + ---------- + nout : int + Output dim size. + dim : int + Dimension to be normalized. 
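+
+    Examples
+    ----------
+    A small construction sketch with the default dim=-1; the sizes are
+    illustrative only.
+
+    >>> layer_norm = LayerNorm(256)
+    >>> x = paddle.randn([4, 80, 256])
+    >>> y = layer_norm(x)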
""" def __init__(self, nout, dim=-1): @@ -33,12 +35,15 @@ class LayerNorm(paddle.nn.LayerNorm): def forward(self, x): """Apply layer normalization. - Args: - x (torch.Tensor): Input tensor. - - Returns: - torch.Tensor: Normalized tensor. + Parameters + ---------- + x : paddle.Tensor + Input tensor. + Returns + ---------- + paddle.Tensor + Normalized tensor. """ if self.dim == -1: return super(LayerNorm, self).forward(x) diff --git a/parakeet/modules/masked_fill.py b/parakeet/modules/masked_fill.py index 5c751dd..4ca9826 100644 --- a/parakeet/modules/masked_fill.py +++ b/parakeet/modules/masked_fill.py @@ -28,7 +28,7 @@ def is_broadcastable(shp1, shp2): def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - # assert is_broadcastable(xs.shape, mask.shape) is True + assert is_broadcastable(xs.shape, mask.shape) is True bshape = paddle.broadcast_shape(xs.shape, mask.shape) mask = mask.broadcast_to(bshape) trues = paddle.ones_like(xs) * value diff --git a/parakeet/modules/nets_utils.py b/parakeet/modules/nets_utils.py index 3cf3e4a..d218f62 100644 --- a/parakeet/modules/nets_utils.py +++ b/parakeet/modules/nets_utils.py @@ -13,20 +13,27 @@ # limitations under the License. import paddle +from paddle import nn +from typeguard import check_argument_types -# 按照这个 batch 里面最长的补零 def pad_list(xs, pad_value): """Perform padding for the list of tensors. - Args: - xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value (float): Value for padding. + Parameters + ---------- + xs : List[Tensor] + List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value : float) + Value for padding. - Returns: - Tensor: Padded tensor (B, Tmax, `*`). + Returns + ---------- + Tensor + Padded tensor (B, Tmax, `*`). - Examples: + Examples + ---------- >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] >>> x [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] @@ -34,11 +41,9 @@ def pad_list(xs, pad_value): tensor([[1., 1., 1., 1.], [1., 1., 0., 0.], [1., 0., 0., 0.]]) - """ n_batch = len(xs) max_len = max(x.shape[0] for x in xs) - # pad = xs[0].new(n_batch, max_len, *xs[0].shape[1:]).fill_(pad_value) pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value) for i in range(n_batch): @@ -50,13 +55,18 @@ def pad_list(xs, pad_value): def make_pad_mask(lengths, length_dim=-1): """Make mask tensor containing indices of padded part. - Args: - lengths (LongTensor or List): Batch of lengths (B,). + Parameters + ---------- + lengths : LongTensor or List + Batch of lengths (B,). - Returns: - Tensor: Mask tensor containing indices of padded part bool. + Returns + ---------- + Tensor(bool) + Mask tensor containing indices of padded part bool. - Examples: + Examples + ---------- With only lengths. >>> lengths = [5, 3, 2] @@ -64,7 +74,6 @@ def make_pad_mask(lengths, length_dim=-1): masks = [[0, 0, 0, 0 ,0], [0, 0, 0, 1, 1], [0, 0, 1, 1, 1]] - """ if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) @@ -88,17 +97,24 @@ def make_pad_mask(lengths, length_dim=-1): def make_non_pad_mask(lengths, length_dim=-1): """Make mask tensor containing indices of non-padded part. - Args: - lengths (LongTensor or List): Batch of lengths (B,). - xs (Tensor, optional): The reference tensor. + Parameters + ---------- + lengths : LongTensor or List + Batch of lengths (B,). + xs : Tensor, optional + The reference tensor. If set, masks will be the same shape as this tensor. 
- length_dim (int, optional): Dimension indicator of the above tensor. + length_dim : int, optional + Dimension indicator of the above tensor. See the example. - Returns: - ByteTensor: mask tensor containing indices of padded part bool. + Returns + ---------- + Tensor(bool) + mask tensor containing indices of padded part bool. - Examples: + Examples + ---------- With only lengths. >>> lengths = [5, 3, 2] @@ -106,6 +122,37 @@ def make_non_pad_mask(lengths, length_dim=-1): masks = [[1, 1, 1, 1 ,1], [1, 1, 1, 0, 0], [1, 1, 0, 0, 0]] - """ return paddle.logical_not(make_pad_mask(lengths, length_dim)) + + +def initialize(model: nn.Layer, init: str): + """Initialize weights of a neural network module. + + Parameters are initialized using the given method or distribution. + + Custom initialization routines can be implemented into submodules + + Parameters + ---------- + model : paddle.nn.Layer + Target. + init : str + Method of initialization. + """ + assert check_argument_types() + + if init == "xavier_uniform": + nn.initializer.set_global_initializer(nn.initializer.XavierUniform(), + nn.initializer.Constant()) + elif init == "xavier_normal": + nn.initializer.set_global_initializer(nn.initializer.XavierNormal(), + nn.initializer.Constant()) + elif init == "kaiming_uniform": + nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(), + nn.initializer.Constant()) + elif init == "kaiming_normal": + nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(), + nn.initializer.Constant()) + else: + raise ValueError("Unknown initialization: " + init)
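
Usage sketch for the helpers above; the module paths follow this patch, while
the layer and tensor sizes are illustrative:

    import paddle
    from paddle import nn
    from parakeet.modules.masked_fill import masked_fill
    from parakeet.modules.nets_utils import initialize
    from parakeet.modules.nets_utils import make_pad_mask

    # initialize() selects the weight-init scheme by swapping paddle's
    # global default initializer (see FastSpeech2._reset_parameters above)
    model = nn.Linear(16, 16)
    initialize(model, "xavier_uniform")

    # build a padding mask from per-utterance lengths and zero out the
    # padded positions of a score tensor
    scores = paddle.randn([2, 5])
    # bool mask, True at padded positions, shape (2, 5)
    pad_mask = make_pad_mask([5, 3])
    scores = masked_fill(scores, pad_mask, 0.0)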