format docstrings
This commit is contained in:
parent
3af3c29a94
commit
6553d1d723
|
@ -12,28 +12,26 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Fastspeech2 related modules for paddle"""
|
"""Fastspeech2 related modules for paddle"""
|
||||||
import logging
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from typing import Sequence
|
from typing import Sequence
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
from typeguard import check_argument_types
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
import numpy as np
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
|
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
|
||||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
|
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
|
||||||
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
|
||||||
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
|
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
|
||||||
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
|
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
|
||||||
from parakeet.modules.nets_utils import make_non_pad_mask
|
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
||||||
from parakeet.modules.nets_utils import make_pad_mask
|
|
||||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||||
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
|
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
|
||||||
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
|
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
|
||||||
|
from parakeet.modules.nets_utils import initialize
|
||||||
|
from parakeet.modules.nets_utils import make_non_pad_mask
|
||||||
|
from parakeet.modules.nets_utils import make_pad_mask
|
||||||
|
|
||||||
|
|
||||||
class FastSpeech2(nn.Layer):
|
class FastSpeech2(nn.Layer):
|
||||||
|
@ -155,7 +153,6 @@ class FastSpeech2(nn.Layer):
|
||||||
positionwise_layer_type=positionwise_layer_type,
|
positionwise_layer_type=positionwise_layer_type,
|
||||||
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
|
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
|
||||||
else:
|
else:
|
||||||
print("encoder_type:", encoder_type)
|
|
||||||
raise ValueError(f"{encoder_type} is not supported.")
|
raise ValueError(f"{encoder_type} is not supported.")
|
||||||
|
|
||||||
# define duration predictor
|
# define duration predictor
|
||||||
|
@ -236,6 +233,12 @@ class FastSpeech2(nn.Layer):
|
||||||
use_batch_norm=use_batch_norm,
|
use_batch_norm=use_batch_norm,
|
||||||
dropout_rate=postnet_dropout_rate, ))
|
dropout_rate=postnet_dropout_rate, ))
|
||||||
|
|
||||||
|
# initialize parameters
|
||||||
|
self._reset_parameters(
|
||||||
|
init_type=init_type,
|
||||||
|
init_enc_alpha=init_enc_alpha,
|
||||||
|
init_dec_alpha=init_dec_alpha, )
|
||||||
|
|
||||||
# define criterions
|
# define criterions
|
||||||
self.criterion = FastSpeech2Loss(
|
self.criterion = FastSpeech2Loss(
|
||||||
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
|
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
|
||||||
|
@ -253,25 +256,37 @@ class FastSpeech2(nn.Layer):
|
||||||
energy: paddle.Tensor,
|
energy: paddle.Tensor,
|
||||||
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
|
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
|
||||||
str, paddle.Tensor], paddle.Tensor]:
|
str, paddle.Tensor], paddle.Tensor]:
|
||||||
# """Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
# Args:
|
Parameters
|
||||||
# text (LongTensor): Batch of padded token ids (B, Tmax).
|
----------
|
||||||
# text_lengths (LongTensor): Batch of lengths of each input (B,).
|
text : LongTensor
|
||||||
# speech (Tensor): Batch of padded target features (B, Lmax, odim).
|
Batch of padded token ids (B, Tmax).
|
||||||
# speech_lengths (LongTensor): Batch of the lengths of each target (B,).
|
text_lengths : LongTensor)
|
||||||
# durations (LongTensor): Batch of padded durations (B, Tmax + 1).
|
Batch of lengths of each input (B,).
|
||||||
# durations_lengths (LongTensor): Batch of duration lengths (B, Tmax + 1).
|
speech : Tensor
|
||||||
# pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
Batch of padded target features (B, Lmax, odim).
|
||||||
# pitch_lengths (LongTensor): Batch of pitch lengths (B, Tmax + 1).
|
speech_lengths : LongTensor
|
||||||
# energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
Batch of the lengths of each target (B,).
|
||||||
# energy_lengths (LongTensor): Batch of energy lengths (B, Tmax + 1).
|
durations : LongTensor
|
||||||
# Returns:
|
Batch of padded durations (B, Tmax + 1).
|
||||||
# Tensor: Loss scalar value.
|
durations_lengths : LongTensor
|
||||||
# Dict: Statistics to be monitored.
|
Batch of duration lengths (B, Tmax + 1).
|
||||||
# Tensor: Weight value.
|
pitch : Tensor
|
||||||
|
Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
||||||
# """
|
pitch_lengths : LongTensor
|
||||||
|
Batch of pitch lengths (B, Tmax + 1).
|
||||||
|
energy : Tensor
|
||||||
|
Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
||||||
|
energy_lengths : LongTensor
|
||||||
|
Batch of energy lengths (B, Tmax + 1).
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
Tensor
|
||||||
|
Loss scalar value.
|
||||||
|
Dict
|
||||||
|
Statistics to be monitored.
|
||||||
|
"""
|
||||||
text = text[:, :text_lengths.max()] # for data-parallel
|
text = text[:, :text_lengths.max()] # for data-parallel
|
||||||
speech = speech[:, :speech_lengths.max()] # for data-parallel
|
speech = speech[:, :speech_lengths.max()] # for data-parallel
|
||||||
durations = durations[:, :durations_lengths.max()] # for data-parallel
|
durations = durations[:, :durations_lengths.max()] # for data-parallel
|
||||||
|
@ -282,16 +297,11 @@ class FastSpeech2(nn.Layer):
|
||||||
|
|
||||||
# Add eos at the last of sequence
|
# Add eos at the last of sequence
|
||||||
# xs = F.pad(text, [0, 1], "constant", self.padding_idx)
|
# xs = F.pad(text, [0, 1], "constant", self.padding_idx)
|
||||||
print("xs.shape in fastspeech2.py before:", text.shape, text)
|
|
||||||
xs = np.pad(text.numpy(),
|
xs = np.pad(text.numpy(),
|
||||||
pad_width=((0, 0), (0, 1)),
|
pad_width=((0, 0), (0, 1)),
|
||||||
mode="constant",
|
mode="constant",
|
||||||
constant_values=self.padding_idx)
|
constant_values=self.padding_idx)
|
||||||
xs = paddle.to_tensor(xs)
|
xs = paddle.to_tensor(xs)
|
||||||
print("xs.shape in fastspeech2.py end:", xs.shape, xs)
|
|
||||||
# my_pad = nn.Pad1D(padding=[0, 1], mode="constant", value=self.padding_idx)
|
|
||||||
# xs = my_pad(text)
|
|
||||||
# 是否会数组越界? xs 是否能取到 l -> 可以,因为上一步补充了一个 padding_idx,又变成了 eos
|
|
||||||
for i, l in enumerate(text_lengths):
|
for i, l in enumerate(text_lengths):
|
||||||
xs[i, l] = self.eos
|
xs[i, l] = self.eos
|
||||||
ilens = text_lengths + 1
|
ilens = text_lengths + 1
|
||||||
|
@ -302,23 +312,16 @@ class FastSpeech2(nn.Layer):
|
||||||
# forward propagation
|
# forward propagation
|
||||||
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
|
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
|
||||||
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
|
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
|
||||||
print("d_outs in paddle:", d_outs)
|
|
||||||
print("p_outs in paddle:", p_outs)
|
|
||||||
print("e_outs in paddle:", e_outs)
|
|
||||||
|
|
||||||
# modify mod part of groundtruth
|
# modify mod part of groundtruth
|
||||||
if self.reduction_factor > 1:
|
if self.reduction_factor > 1:
|
||||||
# 需要改
|
|
||||||
olens = paddle.to_tensor([
|
olens = paddle.to_tensor([
|
||||||
olen - olen % self.reduction_factor for olen in olens.numpy()
|
olen - olen % self.reduction_factor for olen in olens.numpy()
|
||||||
])
|
])
|
||||||
max_olen = max(olens)
|
max_olen = max(olens)
|
||||||
ys = ys[:, :max_olen]
|
ys = ys[:, :max_olen]
|
||||||
|
|
||||||
# calculate loss
|
# calculate loss
|
||||||
if self.postnet is None:
|
if self.postnet is None:
|
||||||
after_outs = None
|
after_outs = None
|
||||||
|
|
||||||
# calculate loss
|
# calculate loss
|
||||||
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
|
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
|
||||||
after_outs=after_outs,
|
after_outs=after_outs,
|
||||||
|
@ -363,9 +366,8 @@ class FastSpeech2(nn.Layer):
|
||||||
alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
|
alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
|
||||||
# forward encoder
|
# forward encoder
|
||||||
x_masks = self._source_mask(ilens)
|
x_masks = self._source_mask(ilens)
|
||||||
print("xs.shape in fastspeech2.py:", xs.shape)
|
|
||||||
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
|
||||||
|
|
||||||
|
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
||||||
# forward duration predictor and variance predictors
|
# forward duration predictor and variance predictors
|
||||||
d_masks = make_pad_mask(ilens)
|
d_masks = make_pad_mask(ilens)
|
||||||
|
|
||||||
|
@ -377,10 +379,11 @@ class FastSpeech2(nn.Layer):
|
||||||
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
|
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
|
||||||
else:
|
else:
|
||||||
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
|
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
|
||||||
print("p_outs.shape:", p_outs.shape)
|
|
||||||
if is_inference:
|
if is_inference:
|
||||||
d_outs = self.duration_predictor.inference(hs,
|
d_outs = self.duration_predictor.inference(hs,
|
||||||
d_masks) # (B, Tmax)
|
d_masks) # (B, Tmax)
|
||||||
|
# print("d_outs:",d_outs)
|
||||||
# use prediction in inference
|
# use prediction in inference
|
||||||
# (B, Tmax, 1)
|
# (B, Tmax, 1)
|
||||||
|
|
||||||
|
@ -404,7 +407,6 @@ class FastSpeech2(nn.Layer):
|
||||||
# forward decoder
|
# forward decoder
|
||||||
if olens is not None and not is_inference:
|
if olens is not None and not is_inference:
|
||||||
if self.reduction_factor > 1:
|
if self.reduction_factor > 1:
|
||||||
# 直接to_paddle ,维度会增加 1,需要先转成 numpy
|
|
||||||
olens_in = paddle.to_tensor(
|
olens_in = paddle.to_tensor(
|
||||||
[olen // self.reduction_factor for olen in olens.numpy()])
|
[olen // self.reduction_factor for olen in olens.numpy()])
|
||||||
else:
|
else:
|
||||||
|
@ -412,9 +414,10 @@ class FastSpeech2(nn.Layer):
|
||||||
h_masks = self._source_mask(olens_in)
|
h_masks = self._source_mask(olens_in)
|
||||||
else:
|
else:
|
||||||
h_masks = None
|
h_masks = None
|
||||||
zs, _ = self.decoder(hs, h_masks) # (B, Lmax, adim)
|
# (B, Lmax, adim)
|
||||||
before_outs = self.feat_out(zs).reshape(
|
zs, _ = self.decoder(hs, h_masks)
|
||||||
(zs.shape[0], -1, self.odim)) # (B, Lmax, odim)
|
# (B, Lmax, odim)
|
||||||
|
before_outs = self.feat_out(zs).reshape((zs.shape[0], -1, self.odim))
|
||||||
|
|
||||||
# postnet -> (B, Lmax//r * r, odim)
|
# postnet -> (B, Lmax//r * r, odim)
|
||||||
if self.postnet is None:
|
if self.postnet is None:
|
||||||
|
@ -437,20 +440,30 @@ class FastSpeech2(nn.Layer):
|
||||||
paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
||||||
"""Generate the sequence of features given the sequences of characters.
|
"""Generate the sequence of features given the sequences of characters.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
text (LongTensor): Input sequence of characters (T,).
|
----------
|
||||||
speech (Tensor, optional): Feature sequence to extract style (N, idim).
|
text : LongTensor
|
||||||
durations (LongTensor, optional): Groundtruth of duration (T + 1,).
|
Input sequence of characters (T,).
|
||||||
pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1).
|
speech : Tensor, optional
|
||||||
energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1).
|
Feature sequence to extract style (N, idim).
|
||||||
alpha (float, optional): Alpha to control the speed.
|
durations : LongTensor, optional
|
||||||
use_teacher_forcing (bool, optional): Whether to use teacher forcing.
|
Groundtruth of duration (T + 1,).
|
||||||
If true, groundtruth of duration, pitch and energy will be used.
|
pitch : Tensor, optional
|
||||||
|
Groundtruth of token-averaged pitch (T + 1, 1).
|
||||||
|
energy : Tensor, optional
|
||||||
|
Groundtruth of token-averaged energy (T + 1, 1).
|
||||||
|
alpha : float, optional
|
||||||
|
Alpha to control the speed.
|
||||||
|
use_teacher_forcing : bool, optional
|
||||||
|
Whether to use teacher forcing.
|
||||||
|
If true, groundtruth of duration, pitch and energy will be used.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Output sequence of features (L, odim).
|
----------
|
||||||
None: Dummy for compatibility.
|
Tensor
|
||||||
None: Dummy for compatibility.
|
Output sequence of features (L, odim).
|
||||||
|
None
|
||||||
|
Dummy for compatibility.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x, y = text, speech
|
x, y = text, speech
|
||||||
|
@ -460,13 +473,15 @@ class FastSpeech2(nn.Layer):
|
||||||
x = np.pad(text.numpy(),
|
x = np.pad(text.numpy(),
|
||||||
pad_width=((0, 1)),
|
pad_width=((0, 1)),
|
||||||
mode="constant",
|
mode="constant",
|
||||||
constant_values=self.padding_idx)
|
constant_values=self.eos)
|
||||||
|
|
||||||
x = paddle.to_tensor(x)
|
x = paddle.to_tensor(x)
|
||||||
|
|
||||||
# setup batch axis
|
# setup batch axis
|
||||||
ilens = paddle.to_tensor(
|
ilens = paddle.to_tensor(
|
||||||
[x.shape[0]], dtype=paddle.int64, place=x.place)
|
[x.shape[0]], dtype=paddle.int64, place=x.place)
|
||||||
xs, ys = x.unsqueeze(0), None
|
xs, ys = x.unsqueeze(0), None
|
||||||
|
|
||||||
if y is not None:
|
if y is not None:
|
||||||
ys = y.unsqueeze(0)
|
ys = y.unsqueeze(0)
|
||||||
|
|
||||||
|
@ -493,14 +508,19 @@ class FastSpeech2(nn.Layer):
|
||||||
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
|
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
|
||||||
"""Make masks for self-attention.
|
"""Make masks for self-attention.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
ilens (LongTensor): Batch of lengths (B,).
|
----------
|
||||||
|
ilens : LongTensor
|
||||||
|
Batch of lengths (B,).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Mask tensor for self-attention.
|
-------
|
||||||
|
Tensor
|
||||||
|
Mask tensor for self-attention.
|
||||||
dtype=paddle.bool
|
dtype=paddle.bool
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
-------
|
||||||
>>> ilens = [5, 3]
|
>>> ilens = [5, 3]
|
||||||
>>> self._source_mask(ilens)
|
>>> self._source_mask(ilens)
|
||||||
tensor([[[1, 1, 1, 1, 1],
|
tensor([[[1, 1, 1, 1, 1],
|
||||||
|
@ -510,6 +530,29 @@ class FastSpeech2(nn.Layer):
|
||||||
x_masks = make_non_pad_mask(ilens)
|
x_masks = make_non_pad_mask(ilens)
|
||||||
return x_masks.unsqueeze(-2)
|
return x_masks.unsqueeze(-2)
|
||||||
|
|
||||||
|
def _reset_parameters(self,
|
||||||
|
init_type: str,
|
||||||
|
init_enc_alpha: float,
|
||||||
|
init_dec_alpha: float):
|
||||||
|
# initialize parameters
|
||||||
|
initialize(self, init_type)
|
||||||
|
|
||||||
|
# initialize alpha in scaled positional encoding
|
||||||
|
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||||
|
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
|
||||||
|
self.encoder.embed[-1].alpha = paddle.create_parameter(
|
||||||
|
shape=init_enc_alpha.shape,
|
||||||
|
dtype=str(init_enc_alpha.numpy().dtype),
|
||||||
|
default_initializer=paddle.nn.initializer.Assign(
|
||||||
|
init_enc_alpha))
|
||||||
|
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||||
|
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
|
||||||
|
self.decoder.embed[-1].alpha = paddle.create_parameter(
|
||||||
|
shape=init_dec_alpha.shape,
|
||||||
|
dtype=str(init_dec_alpha.numpy().dtype),
|
||||||
|
default_initializer=paddle.nn.initializer.Assign(
|
||||||
|
init_dec_alpha))
|
||||||
|
|
||||||
|
|
||||||
class FastSpeech2Loss(nn.Layer):
|
class FastSpeech2Loss(nn.Layer):
|
||||||
"""Loss function module for FastSpeech2."""
|
"""Loss function module for FastSpeech2."""
|
||||||
|
@ -519,12 +562,12 @@ class FastSpeech2Loss(nn.Layer):
|
||||||
use_weighted_masking: bool=False):
|
use_weighted_masking: bool=False):
|
||||||
"""Initialize feed-forward Transformer loss module.
|
"""Initialize feed-forward Transformer loss module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
use_masking (bool):
|
----------
|
||||||
|
use_masking : bool
|
||||||
Whether to apply masking for padded part in loss calculation.
|
Whether to apply masking for padded part in loss calculation.
|
||||||
use_weighted_masking (bool):
|
use_weighted_masking : bool
|
||||||
Whether to weighted masking in loss calculation.
|
Whether to weighted masking in loss calculation.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
assert check_argument_types()
|
assert check_argument_types()
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -555,24 +598,41 @@ class FastSpeech2Loss(nn.Layer):
|
||||||
paddle.Tensor, paddle.Tensor]:
|
paddle.Tensor, paddle.Tensor]:
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
|
----------
|
||||||
before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
|
after_outs : Tensor
|
||||||
d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax).
|
Batch of outputs after postnets (B, Lmax, odim).
|
||||||
p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
|
before_outs : Tensor
|
||||||
e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
|
Batch of outputs before postnets (B, Lmax, odim).
|
||||||
ys (Tensor): Batch of target features (B, Lmax, odim).
|
d_outs : LongTensor
|
||||||
ds (LongTensor): Batch of durations (B, Tmax).
|
Batch of outputs of duration predictor (B, Tmax).
|
||||||
ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
|
p_outs : Tensor
|
||||||
es (Tensor): Batch of target token-averaged energy (B, Tmax, 1).
|
Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||||
ilens (LongTensor): Batch of the lengths of each input (B,).
|
e_outs : Tensor
|
||||||
olens (LongTensor): Batch of the lengths of each target (B,).
|
Batch of outputs of energy predictor (B, Tmax, 1).
|
||||||
|
ys : Tensor
|
||||||
|
Batch of target features (B, Lmax, odim).
|
||||||
|
ds : LongTensor
|
||||||
|
Batch of durations (B, Tmax).
|
||||||
|
ps : Tensor
|
||||||
|
Batch of target token-averaged pitch (B, Tmax, 1).
|
||||||
|
es : Tensor
|
||||||
|
Batch of target token-averaged energy (B, Tmax, 1).
|
||||||
|
ilens : LongTensor
|
||||||
|
Batch of the lengths of each input (B,).
|
||||||
|
olens : LongTensor
|
||||||
|
Batch of the lengths of each target (B,).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: L1 loss value.
|
----------
|
||||||
Tensor: Duration predictor loss value.
|
Tensor
|
||||||
Tensor: Pitch predictor loss value.
|
L1 loss value.
|
||||||
Tensor: Energy predictor loss value.
|
Tensor
|
||||||
|
Duration predictor loss value.
|
||||||
|
Tensor
|
||||||
|
Pitch predictor loss value.
|
||||||
|
Tensor
|
||||||
|
Energy predictor loss value.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# apply mask to remove padded part
|
# apply mask to remove padded part
|
||||||
|
|
|
@ -15,7 +15,6 @@
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from parakeet.modules.layer_norm import LayerNorm
|
from parakeet.modules.layer_norm import LayerNorm
|
||||||
from parakeet.modules.masked_fill import masked_fill
|
from parakeet.modules.masked_fill import masked_fill
|
||||||
|
|
||||||
|
@ -31,7 +30,8 @@ class DurationPredictor(nn.Layer):
|
||||||
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
|
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
|
||||||
https://arxiv.org/pdf/1905.09263.pdf
|
https://arxiv.org/pdf/1905.09263.pdf
|
||||||
|
|
||||||
Note:
|
Note
|
||||||
|
----------
|
||||||
The calculation domain of outputs is different
|
The calculation domain of outputs is different
|
||||||
between in `forward` and in `inference`. In `forward`,
|
between in `forward` and in `inference`. In `forward`,
|
||||||
the outputs are calculated in log domain but in `inference`,
|
the outputs are calculated in log domain but in `inference`,
|
||||||
|
@ -48,13 +48,20 @@ class DurationPredictor(nn.Layer):
|
||||||
offset=1.0):
|
offset=1.0):
|
||||||
"""Initilize duration predictor module.
|
"""Initilize duration predictor module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
n_layers (int, optional): Number of convolutional layers.
|
idim : int
|
||||||
n_chans (int, optional): Number of channels of convolutional layers.
|
Input dimension.
|
||||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
n_layers : int, optional
|
||||||
dropout_rate (float, optional): Dropout rate.
|
Number of convolutional layers.
|
||||||
offset (float, optional): Offset value to avoid nan in log domain.
|
n_chans : int, optional
|
||||||
|
Number of channels of convolutional layers.
|
||||||
|
kernel_size : int, optional
|
||||||
|
Kernel size of convolutional layers.
|
||||||
|
dropout_rate : float, optional
|
||||||
|
Dropout rate.
|
||||||
|
offset : float, optional
|
||||||
|
Offset value to avoid nan in log domain.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super(DurationPredictor, self).__init__()
|
super(DurationPredictor, self).__init__()
|
||||||
|
@ -74,7 +81,7 @@ class DurationPredictor(nn.Layer):
|
||||||
LayerNorm(
|
LayerNorm(
|
||||||
n_chans, dim=1),
|
n_chans, dim=1),
|
||||||
nn.Dropout(dropout_rate), ))
|
nn.Dropout(dropout_rate), ))
|
||||||
self.linear = nn.Linear(n_chans, 1)
|
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||||
|
|
||||||
def _forward(self, xs, x_masks=None, is_inference=False):
|
def _forward(self, xs, x_masks=None, is_inference=False):
|
||||||
# (B, idim, Tmax)
|
# (B, idim, Tmax)
|
||||||
|
@ -83,7 +90,7 @@ class DurationPredictor(nn.Layer):
|
||||||
for f in self.conv:
|
for f in self.conv:
|
||||||
xs = f(xs)
|
xs = f(xs)
|
||||||
|
|
||||||
# NOTE: calculate in log domain
|
# NOTE: calculate in log domain
|
||||||
# (B, Tmax)
|
# (B, Tmax)
|
||||||
xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1)
|
xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1)
|
||||||
|
|
||||||
|
@ -99,28 +106,34 @@ class DurationPredictor(nn.Layer):
|
||||||
def forward(self, xs, x_masks=None):
|
def forward(self, xs, x_masks=None):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
----------
|
||||||
x_masks (ByteTensor, optional):
|
xs : Tensor
|
||||||
|
Batch of input sequences (B, Tmax, idim).
|
||||||
|
x_masks : ByteTensor, optional
|
||||||
Batch of masks indicating padded part (B, Tmax).
|
Batch of masks indicating padded part (B, Tmax).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Batch of predicted durations in log domain (B, Tmax).
|
----------
|
||||||
|
Tensor
|
||||||
|
Batch of predicted durations in log domain (B, Tmax).
|
||||||
"""
|
"""
|
||||||
return self._forward(xs, x_masks, False)
|
return self._forward(xs, x_masks, False)
|
||||||
|
|
||||||
def inference(self, xs, x_masks=None):
|
def inference(self, xs, x_masks=None):
|
||||||
"""Inference duration.
|
"""Inference duration.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
----------
|
||||||
x_masks (ByteTensor, optional):
|
xs : Tensor
|
||||||
|
Batch of input sequences (B, Tmax, idim).
|
||||||
|
x_masks : Tensor(bool), optional
|
||||||
Batch of masks indicating padded part (B, Tmax).
|
Batch of masks indicating padded part (B, Tmax).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
LongTensor: Batch of predicted durations in linear domain int64 (B, Tmax).
|
----------
|
||||||
|
LongTensor
|
||||||
|
Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||||
"""
|
"""
|
||||||
return self._forward(xs, x_masks, True)
|
return self._forward(xs, x_masks, True)
|
||||||
|
|
||||||
|
@ -135,10 +148,12 @@ class DurationPredictorLoss(nn.Layer):
|
||||||
def __init__(self, offset=1.0, reduction="mean"):
|
def __init__(self, offset=1.0, reduction="mean"):
|
||||||
"""Initilize duration predictor loss module.
|
"""Initilize duration predictor loss module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
offset (float, optional): Offset value to avoid nan in log domain.
|
----------
|
||||||
reduction (str): Reduction type in loss calculation.
|
offset : float, optional
|
||||||
|
Offset value to avoid nan in log domain.
|
||||||
|
reduction : str
|
||||||
|
Reduction type in loss calculation.
|
||||||
"""
|
"""
|
||||||
super(DurationPredictorLoss, self).__init__()
|
super(DurationPredictorLoss, self).__init__()
|
||||||
self.criterion = nn.MSELoss(reduction=reduction)
|
self.criterion = nn.MSELoss(reduction=reduction)
|
||||||
|
@ -147,16 +162,21 @@ class DurationPredictorLoss(nn.Layer):
|
||||||
def forward(self, outputs, targets):
|
def forward(self, outputs, targets):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
outputs (Tensor): Batch of prediction durations in log domain (B, T)
|
----------
|
||||||
targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)
|
outputs : Tensor
|
||||||
|
Batch of prediction durations in log domain (B, T)
|
||||||
|
targets : LongTensor
|
||||||
|
Batch of groundtruth durations in linear domain (B, T)
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Mean squared error loss value.
|
----------
|
||||||
|
Tensor
|
||||||
|
Mean squared error loss value.
|
||||||
|
|
||||||
Note:
|
Note
|
||||||
|
----------
|
||||||
`outputs` is in log domain but `targets` is in linear domain.
|
`outputs` is in log domain but `targets` is in linear domain.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# NOTE: outputs is in log domain while targets in linear
|
# NOTE: outputs is in log domain while targets in linear
|
||||||
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
|
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
|
||||||
|
|
|
@ -13,8 +13,6 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Length regulator related modules."""
|
"""Length regulator related modules."""
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
@ -37,8 +35,10 @@ class LengthRegulator(nn.Layer):
|
||||||
def __init__(self, pad_value=0.0):
|
def __init__(self, pad_value=0.0):
|
||||||
"""Initilize length regulator module.
|
"""Initilize length regulator module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
pad_value (float, optional): Value used for padding.
|
----------
|
||||||
|
pad_value : float, optional
|
||||||
|
Value used for padding.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -68,14 +68,19 @@ class LengthRegulator(nn.Layer):
|
||||||
def forward(self, xs, ds, alpha=1.0):
|
def forward(self, xs, ds, alpha=1.0):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
----------
|
||||||
ds (LongTensor): Batch of durations of each frame (B, T).
|
xs : Tensor
|
||||||
alpha (float, optional): Alpha value to control speed of speech.
|
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||||
|
ds : LongTensor
|
||||||
Returns:
|
Batch of durations of each frame (B, T).
|
||||||
Tensor: replicated input tensor based on durations (B, T*, D).
|
alpha : float, optional
|
||||||
|
Alpha value to control speed of speech.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
Tensor
|
||||||
|
replicated input tensor based on durations (B, T*, D).
|
||||||
"""
|
"""
|
||||||
if alpha != 1.0:
|
if alpha != 1.0:
|
||||||
assert alpha > 0
|
assert alpha > 0
|
||||||
|
|
|
@ -43,15 +43,22 @@ class Postnet(nn.Layer):
|
||||||
use_batch_norm=True, ):
|
use_batch_norm=True, ):
|
||||||
"""Initialize postnet module.
|
"""Initialize postnet module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Dimension of the inputs.
|
----------
|
||||||
odim (int): Dimension of the outputs.
|
idim : int
|
||||||
n_layers (int, optional): The number of layers.
|
Dimension of the inputs.
|
||||||
n_filts (int, optional): The number of filter size.
|
odim : int
|
||||||
n_units (int, optional): The number of filter channels.
|
Dimension of the outputs.
|
||||||
use_batch_norm (bool, optional): Whether to use batch normalization..
|
n_layers : int, optional
|
||||||
dropout_rate (float, optional): Dropout rate..
|
The number of layers.
|
||||||
|
n_filts : int, optional
|
||||||
|
The number of filter size.
|
||||||
|
n_units : int, optional
|
||||||
|
The number of filter channels.
|
||||||
|
use_batch_norm : bool, optional
|
||||||
|
Whether to use batch normalization..
|
||||||
|
dropout_rate : float, optional
|
||||||
|
Dropout rate..
|
||||||
"""
|
"""
|
||||||
super(Postnet, self).__init__()
|
super(Postnet, self).__init__()
|
||||||
self.postnet = nn.LayerList()
|
self.postnet = nn.LayerList()
|
||||||
|
@ -111,11 +118,15 @@ class Postnet(nn.Layer):
|
||||||
def forward(self, xs):
|
def forward(self, xs):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
|
----------
|
||||||
|
xs : Tensor
|
||||||
|
Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Batch of padded output tensor. (B, odim, Tmax).
|
----------
|
||||||
|
Tensor
|
||||||
|
Batch of padded output tensor. (B, odim, Tmax).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for i in six.moves.range(len(self.postnet)):
|
for i in six.moves.range(len(self.postnet)):
|
||||||
|
|
|
@ -15,10 +15,8 @@
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from parakeet.modules.layer_norm import LayerNorm
|
from parakeet.modules.layer_norm import LayerNorm
|
||||||
from parakeet.modules.masked_fill import masked_fill
|
from parakeet.modules.masked_fill import masked_fill
|
||||||
|
|
||||||
from typeguard import check_argument_types
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,13 +41,18 @@ class VariancePredictor(nn.Layer):
|
||||||
dropout_rate: float=0.5, ):
|
dropout_rate: float=0.5, ):
|
||||||
"""Initilize duration predictor module.
|
"""Initilize duration predictor module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
n_layers (int, optional): Number of convolutional layers.
|
idim : int
|
||||||
n_chans (int, optional): Number of channels of convolutional layers.
|
Input dimension.
|
||||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
n_layers : int, optional
|
||||||
dropout_rate (float, optional): Dropout rate.
|
Number of convolutional layers.
|
||||||
|
n_chans : int, optional
|
||||||
|
Number of channels of convolutional layers.
|
||||||
|
kernel_size : int, optional
|
||||||
|
Kernel size of convolutional layers.
|
||||||
|
dropout_rate : float, optional
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
assert check_argument_types()
|
assert check_argument_types()
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -70,26 +73,30 @@ class VariancePredictor(nn.Layer):
|
||||||
n_chans, dim=1),
|
n_chans, dim=1),
|
||||||
nn.Dropout(dropout_rate), ))
|
nn.Dropout(dropout_rate), ))
|
||||||
|
|
||||||
self.linear = nn.Linear(n_chans, 1)
|
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||||
|
|
||||||
def forward(self, xs: paddle.Tensor,
|
def forward(self, xs: paddle.Tensor,
|
||||||
x_masks: paddle.Tensor=None) -> paddle.Tensor:
|
x_masks: paddle.Tensor=None) -> paddle.Tensor:
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
----------
|
||||||
x_masks (ByteTensor, optional):
|
xs : Tensor
|
||||||
|
Batch of input sequences (B, Tmax, idim).
|
||||||
|
x_masks : Tensor(bool), optional
|
||||||
Batch of masks indicating padded part (B, Tmax, 1).
|
Batch of masks indicating padded part (B, Tmax, 1).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Batch of predicted sequences (B, Tmax, 1).
|
----------
|
||||||
|
Tensor
|
||||||
|
Batch of predicted sequences (B, Tmax, 1).
|
||||||
"""
|
"""
|
||||||
# (B, idim, Tmax)
|
# (B, idim, Tmax)
|
||||||
xs = xs.transpose([0, 2, 1])
|
xs = xs.transpose([0, 2, 1])
|
||||||
# (B, C, Tmax)
|
# (B, C, Tmax)
|
||||||
for f in self.conv:
|
for f in self.conv:
|
||||||
xs = f(xs) # (B, C, Tmax)
|
# (B, C, Tmax)
|
||||||
|
xs = f(xs)
|
||||||
# (B, Tmax, 1)
|
# (B, Tmax, 1)
|
||||||
xs = self.linear(xs.transpose([0, 2, 1]))
|
xs = self.linear(xs.transpose([0, 2, 1]))
|
||||||
|
|
||||||
|
|
|
@ -16,23 +16,22 @@
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from paddle.fluid.layers import sequence_mask
|
|
||||||
|
|
||||||
from parakeet.modules.masked_fill import masked_fill
|
from parakeet.modules.masked_fill import masked_fill
|
||||||
|
|
||||||
|
|
||||||
class MultiHeadedAttention(nn.Layer):
|
class MultiHeadedAttention(nn.Layer):
|
||||||
"""Multi-Head Attention layer.
|
"""Multi-Head Attention layer.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
n_head (int): The number of heads.
|
----------
|
||||||
n_feat (int): The number of features.
|
n_head : int
|
||||||
dropout_rate (float): Dropout rate.
|
The number of heads.
|
||||||
|
n_feat : int
|
||||||
|
The number of features.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, n_head, n_feat, dropout_rate):
|
def __init__(self, n_head, n_feat, dropout_rate):
|
||||||
|
@ -42,33 +41,42 @@ class MultiHeadedAttention(nn.Layer):
|
||||||
# We assume d_v always equals d_k
|
# We assume d_v always equals d_k
|
||||||
self.d_k = n_feat // n_head
|
self.d_k = n_feat // n_head
|
||||||
self.h = n_head
|
self.h = n_head
|
||||||
self.linear_q = nn.Linear(n_feat, n_feat)
|
self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.linear_k = nn.Linear(n_feat, n_feat)
|
self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.linear_v = nn.Linear(n_feat, n_feat)
|
self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.linear_out = nn.Linear(n_feat, n_feat)
|
self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.attn = None
|
self.attn = None
|
||||||
self.dropout = nn.Dropout(p=dropout_rate)
|
self.dropout = nn.Dropout(p=dropout_rate)
|
||||||
|
|
||||||
def forward_qkv(self, query, key, value):
|
def forward_qkv(self, query, key, value):
|
||||||
"""Transform query, key and value.
|
"""Transform query, key and value.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
----------
|
||||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
query : paddle.Tensor
|
||||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
query tensor (#batch, time1, size).
|
||||||
|
key : paddle.Tensor
|
||||||
Returns:
|
Key tensor (#batch, time2, size).
|
||||||
paddle.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
|
value : paddle.Tensor
|
||||||
paddle.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
|
Value tensor (#batch, time2, size).
|
||||||
paddle.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed query tensor (#batch, n_head, time1, d_k).
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed key tensor (#batch, n_head, time2, d_k).
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed value tensor (#batch, n_head, time2, d_k).
|
||||||
"""
|
"""
|
||||||
n_batch = query.shape[0]
|
n_batch = query.shape[0]
|
||||||
|
|
||||||
q = paddle.reshape(
|
q = paddle.reshape(
|
||||||
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
|
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
|
||||||
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
|
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
|
||||||
v = paddle.reshape(
|
v = paddle.reshape(
|
||||||
self.linear_v(value), [n_batch, -1, self.h, self.d_k])
|
self.linear_v(value), [n_batch, -1, self.h, self.d_k])
|
||||||
|
|
||||||
# (batch, head, time1, d_k)
|
# (batch, head, time1, d_k)
|
||||||
q = q.transpose((0, 2, 1, 3))
|
q = q.transpose((0, 2, 1, 3))
|
||||||
# (batch, head, time2, d_k)
|
# (batch, head, time2, d_k)
|
||||||
|
@ -80,44 +88,40 @@ class MultiHeadedAttention(nn.Layer):
|
||||||
def forward_attention(self, value, scores, mask=None):
|
def forward_attention(self, value, scores, mask=None):
|
||||||
"""Compute attention context vector.
|
"""Compute attention context vector.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
value (paddle.Tensor): Transformed value (#batch, n_head, time2, d_k).
|
----------
|
||||||
scores (paddle.Tensor): Attention score (#batch, n_head, time1, time2).
|
value : paddle.Tensor
|
||||||
mask (paddle.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
|
Transformed value (#batch, n_head, time2, d_k).
|
||||||
|
scores : paddle.Tensor
|
||||||
|
Attention score (#batch, n_head, time1, time2).
|
||||||
|
mask : paddle.Tensor
|
||||||
|
Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
paddle.Tensor: Transformed value (#batch, time1, d_model)
|
----------
|
||||||
|
paddle.Tensor:
|
||||||
|
Transformed value (#batch, time1, d_model)
|
||||||
weighted by the attention score (#batch, time1, time2).
|
weighted by the attention score (#batch, time1, time2).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
n_batch = value.shape[0]
|
n_batch = value.shape[0]
|
||||||
softmax = paddle.nn.Softmax(axis=-1)
|
softmax = paddle.nn.Softmax(axis=-1)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
|
|
||||||
mask = mask.unsqueeze(1)
|
mask = mask.unsqueeze(1)
|
||||||
# mask 取反, pad 的位置变成 true,之后 pad 的位置被替换为 0
|
|
||||||
mask = paddle.logical_not(mask)
|
mask = paddle.logical_not(mask)
|
||||||
|
|
||||||
# mask = paddle.cast(mask, dtype='int64')
|
|
||||||
# mask ==1 的位置用 min_value 代替
|
|
||||||
# scores = scores.masked_fill(mask, min_value)
|
|
||||||
min_value = float(
|
min_value = float(
|
||||||
numpy.finfo(
|
numpy.finfo(
|
||||||
paddle.to_tensor(
|
paddle.to_tensor(
|
||||||
0, dtype=scores.dtype).numpy().dtype).min)
|
0, dtype=scores.dtype).numpy().dtype).min)
|
||||||
|
|
||||||
scores = masked_fill(scores, mask, min_value)
|
scores = masked_fill(scores, mask, min_value)
|
||||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
# (batch, head, time1, time2)
|
||||||
|
self.attn = softmax(scores)
|
||||||
# 用value填充tensor中与mask中值为1位置相对应的元素 == 保留 mask 为0 的值
|
|
||||||
# self.attn = torch.softmax(scores, dim=-1).masked_fill(
|
|
||||||
# mask, 0.0
|
|
||||||
# ) # (batch, head, time1, time2)
|
|
||||||
# 保留 mask 为 0 的位置,其他变成 0
|
|
||||||
self.attn = masked_fill(self.attn, mask, 0.0)
|
self.attn = masked_fill(self.attn, mask, 0.0)
|
||||||
else:
|
else:
|
||||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
# (batch, head, time1, time2)
|
||||||
# (batch, head, time1, time2)
|
self.attn = softmax(scores)
|
||||||
|
# (batch, head, time1, time2)
|
||||||
p_attn = self.dropout(self.attn)
|
p_attn = self.dropout(self.attn)
|
||||||
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
|
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
|
||||||
x = paddle.matmul(p_attn, value)
|
x = paddle.matmul(p_attn, value)
|
||||||
|
@ -130,16 +134,21 @@ class MultiHeadedAttention(nn.Layer):
|
||||||
def forward(self, query, key, value, mask=None):
|
def forward(self, query, key, value, mask=None):
|
||||||
"""Compute scaled dot product attention.
|
"""Compute scaled dot product attention.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
----------
|
||||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
query : paddle.Tensor
|
||||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
Query tensor (#batch, time1, size).
|
||||||
mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
|
key : paddle.Tensor
|
||||||
(#batch, time1, time2).
|
Key tensor (#batch, time2, size).
|
||||||
|
value : paddle.Tensor
|
||||||
Returns:
|
Value tensor (#batch, time2, size).
|
||||||
paddle.Tensor: Output tensor (#batch, time1, d_model).
|
mask : paddle.Tensor
|
||||||
|
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor (#batch, time1, d_model).
|
||||||
"""
|
"""
|
||||||
q, k, v = self.forward_qkv(query, key, value)
|
q, k, v = self.forward_qkv(query, key, value)
|
||||||
scores = paddle.matmul(q, k.transpose(
|
scores = paddle.matmul(q, k.transpose(
|
||||||
|
|
|
@ -22,14 +22,16 @@ from paddle import nn
|
||||||
class PositionalEncoding(nn.Layer):
|
class PositionalEncoding(nn.Layer):
|
||||||
"""Positional encoding.
|
"""Positional encoding.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
d_model (int): Embedding dimension.
|
----------
|
||||||
dropout_rate (float): Dropout rate.
|
d_model : int
|
||||||
max_len (int): Maximum input length.
|
Embedding dimension.
|
||||||
reverse (bool): Whether to reverse the input position. Only for
|
dropout_rate : float
|
||||||
the class LegacyRelPositionalEncoding. We remove it in the current
|
Dropout rate.
|
||||||
class RelPositionalEncoding.
|
max_len : int
|
||||||
|
Maximum input length.
|
||||||
|
reverse : bool
|
||||||
|
Whether to reverse the input position. Only for
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
||||||
|
@ -47,7 +49,6 @@ class PositionalEncoding(nn.Layer):
|
||||||
|
|
||||||
pe = paddle.zeros([x.shape[1], self.d_model])
|
pe = paddle.zeros([x.shape[1], self.d_model])
|
||||||
if self.reverse:
|
if self.reverse:
|
||||||
# (x.shape[1],1)
|
|
||||||
position = paddle.arange(
|
position = paddle.arange(
|
||||||
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
|
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
|
||||||
else:
|
else:
|
||||||
|
@ -65,12 +66,15 @@ class PositionalEncoding(nn.Layer):
|
||||||
def forward(self, x: paddle.Tensor):
|
def forward(self, x: paddle.Tensor):
|
||||||
"""Add positional encoding.
|
"""Add positional encoding.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Input tensor (batch, time, `*`).
|
||||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Encoded tensor (batch, time, `*`).
|
||||||
"""
|
"""
|
||||||
self.extend_pe(x)
|
self.extend_pe(x)
|
||||||
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
||||||
|
@ -82,11 +86,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
||||||
|
|
||||||
See Sec. 3.2 https://arxiv.org/abs/1809.08895
|
See Sec. 3.2 https://arxiv.org/abs/1809.08895
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
d_model (int): Embedding dimension.
|
----------
|
||||||
dropout_rate (float): Dropout rate.
|
d_model : int
|
||||||
max_len (int): Maximum input length.
|
Embedding dimension.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
max_len : int
|
||||||
|
Maximum input length.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, d_model, dropout_rate, max_len=5000):
|
def __init__(self, d_model, dropout_rate, max_len=5000):
|
||||||
|
@ -106,12 +113,15 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Add positional encoding.
|
"""Add positional encoding.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Input tensor (batch, time, `*`).
|
||||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Encoded tensor (batch, time, `*`).
|
||||||
"""
|
"""
|
||||||
self.extend_pe(x)
|
self.extend_pe(x)
|
||||||
x = x + self.alpha * self.pe[:, :x.shape[1]]
|
x = x + self.alpha * self.pe[:, :x.shape[1]]
|
||||||
|
|
|
@ -12,19 +12,11 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import math
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
import logging
|
import logging
|
||||||
import paddle
|
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
from paddle.nn import functional as F
|
|
||||||
from paddle.nn import initializer as I
|
|
||||||
from paddle.fluid.layers import sequence_mask
|
|
||||||
import sys
|
|
||||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
||||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
|
||||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
|
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
|
||||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
|
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
|
||||||
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
|
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
|
||||||
|
@ -35,28 +27,44 @@ from parakeet.modules.fastspeech2_transformer.repeat import repeat
|
||||||
class Encoder(nn.Layer):
|
class Encoder(nn.Layer):
|
||||||
"""Transformer encoder module.
|
"""Transformer encoder module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
attention_dim (int): Dimention of attention.
|
idim : int
|
||||||
attention_heads (int): The number of heads of multi head attention.
|
Input dimension.
|
||||||
linear_units (int): The number of units of position-wise feed forward.
|
attention_dim : int
|
||||||
num_blocks (int): The number of decoder blocks.
|
Dimention of attention.
|
||||||
dropout_rate (float): Dropout rate.
|
attention_heads : int
|
||||||
positional_dropout_rate (float): Dropout rate after adding positional encoding.
|
The number of heads of multi head attention.
|
||||||
attention_dropout_rate (float): Dropout rate in attention.
|
linear_units : int
|
||||||
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
|
The number of units of position-wise feed forward.
|
||||||
pos_enc_class (paddle.nn.Layer): Positional encoding module class.
|
num_blocks : int
|
||||||
|
The number of decoder blocks.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
positional_dropout_rate : float
|
||||||
|
Dropout rate after adding positional encoding.
|
||||||
|
attention_dropout_rate : float
|
||||||
|
Dropout rate in attention.
|
||||||
|
input_layer : Union[str, paddle.nn.Layer]
|
||||||
|
Input layer type.
|
||||||
|
pos_enc_class : paddle.nn.Layer
|
||||||
|
Positional encoding module class.
|
||||||
`PositionalEncoding `or `ScaledPositionalEncoding`
|
`PositionalEncoding `or `ScaledPositionalEncoding`
|
||||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
normalize_before : bool
|
||||||
concat_after (bool): Whether to concat attention layer's input and output.
|
Whether to use layer_norm before the first block.
|
||||||
|
concat_after : bool
|
||||||
|
Whether to concat attention layer's input and output.
|
||||||
if True, additional linear will be applied.
|
if True, additional linear will be applied.
|
||||||
i.e. x -> x + linear(concat(x, att(x)))
|
i.e. x -> x + linear(concat(x, att(x)))
|
||||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||||
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
|
positionwise_layer_type : str
|
||||||
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
|
"linear", "conv1d", or "conv1d-linear".
|
||||||
selfattention_layer_type (str): Encoder attention layer type.
|
positionwise_conv_kernel_size : int
|
||||||
padding_idx (int): Padding idx for input_layer=embed.
|
Kernel size of positionwise conv1d layer.
|
||||||
|
selfattention_layer_type : str
|
||||||
|
Encoder attention layer type.
|
||||||
|
padding_idx : int
|
||||||
|
Padding idx for input_layer=embed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -82,7 +90,8 @@ class Encoder(nn.Layer):
|
||||||
self.conv_subsampling_factor = 1
|
self.conv_subsampling_factor = 1
|
||||||
if input_layer == "linear":
|
if input_layer == "linear":
|
||||||
self.embed = nn.Sequential(
|
self.embed = nn.Sequential(
|
||||||
nn.Linear(idim, attention_dim),
|
nn.Linear(
|
||||||
|
idim, attention_dim, bias_attr=True),
|
||||||
nn.LayerNorm(attention_dim),
|
nn.LayerNorm(attention_dim),
|
||||||
nn.Dropout(dropout_rate),
|
nn.Dropout(dropout_rate),
|
||||||
nn.ReLU(),
|
nn.ReLU(),
|
||||||
|
@ -169,14 +178,19 @@ class Encoder(nn.Layer):
|
||||||
def forward(self, xs, masks):
|
def forward(self, xs, masks):
|
||||||
"""Encode input sequence.
|
"""Encode input sequence.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (paddle.Tensor): Input tensor (#batch, time, idim).
|
----------
|
||||||
masks (paddle.Tensor): Mask tensor (#batch, time).
|
xs : paddle.Tensor
|
||||||
|
Input tensor (#batch, time, idim).
|
||||||
Returns:
|
masks : paddle.Tensor
|
||||||
paddle.Tensor: Output tensor (#batch, time, attention_dim).
|
Mask tensor (#batch, time).
|
||||||
paddle.Tensor: Mask tensor (#batch, time).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor (#batch, time, attention_dim).
|
||||||
|
paddle.Tensor
|
||||||
|
Mask tensor (#batch, time).
|
||||||
"""
|
"""
|
||||||
xs = self.embed(xs)
|
xs = self.embed(xs)
|
||||||
xs, masks = self.encoders(xs, masks)
|
xs, masks = self.encoders(xs, masks)
|
||||||
|
@ -187,16 +201,23 @@ class Encoder(nn.Layer):
|
||||||
def forward_one_step(self, xs, masks, cache=None):
|
def forward_one_step(self, xs, masks, cache=None):
|
||||||
"""Encode input frame.
|
"""Encode input frame.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (paddle.Tensor): Input tensor.
|
----------
|
||||||
masks (paddle.Tensor): Mask tensor.
|
xs : paddle.Tensor
|
||||||
cache (List[paddle.Tensor]): List of cache tensors.
|
Input tensor.
|
||||||
|
masks : paddle.Tensor
|
||||||
Returns:
|
Mask tensor.
|
||||||
paddle.Tensor: Output tensor.
|
cache : List[paddle.Tensor]
|
||||||
paddle.Tensor: Mask tensor.
|
List of cache tensors.
|
||||||
List[paddle.Tensor]: List of new cache tensors.
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor.
|
||||||
|
paddle.Tensor
|
||||||
|
Mask tensor.
|
||||||
|
List[paddle.Tensor]
|
||||||
|
List of new cache tensors.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
xs = self.embed(xs)
|
xs = self.embed(xs)
|
||||||
|
|
|
@ -14,28 +14,31 @@
|
||||||
"""Encoder self-attention layer definition."""
|
"""Encoder self-attention layer definition."""
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
|
|
||||||
class EncoderLayer(nn.Layer):
|
class EncoderLayer(nn.Layer):
|
||||||
"""Encoder layer module.
|
"""Encoder layer module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
size (int): Input dimension.
|
----------
|
||||||
self_attn (paddle.nn.Layer): Self-attention module instance.
|
size : int
|
||||||
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
|
Input dimension.
|
||||||
can be used as the argument.
|
self_attn : paddle.nn.Layer
|
||||||
feed_forward (paddle.nn.Layer): Feed-forward module instance.
|
Self-attention module instance.
|
||||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
`MultiHeadedAttention` instance can be used as the argument.
|
||||||
can be used as the argument.
|
feed_forward : paddle.nn.Layer
|
||||||
dropout_rate (float): Dropout rate.
|
Feed-forward module instance.
|
||||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
|
||||||
concat_after (bool): Whether to concat attention layer's input and output.
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
normalize_before : bool
|
||||||
|
Whether to use layer_norm before the first block.
|
||||||
|
concat_after : bool
|
||||||
|
Whether to concat attention layer's input and output.
|
||||||
if True, additional linear will be applied.
|
if True, additional linear will be applied.
|
||||||
i.e. x -> x + linear(concat(x, att(x)))
|
i.e. x -> x + linear(concat(x, att(x)))
|
||||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -57,20 +60,26 @@ class EncoderLayer(nn.Layer):
|
||||||
self.normalize_before = normalize_before
|
self.normalize_before = normalize_before
|
||||||
self.concat_after = concat_after
|
self.concat_after = concat_after
|
||||||
if self.concat_after:
|
if self.concat_after:
|
||||||
self.concat_linear = nn.Linear(size + size, size)
|
self.concat_linear = nn.Linear(size + size, size, bias_attr=True)
|
||||||
|
|
||||||
def forward(self, x, mask, cache=None):
|
def forward(self, x, mask, cache=None):
|
||||||
"""Compute encoded features.
|
"""Compute encoded features.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x_input (paddle.Tensor): Input tensor (#batch, time, size).
|
----------
|
||||||
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
|
x : paddle.Tensor
|
||||||
cache (paddle.Tensor): Cache tensor of the input (#batch, time - 1, size).
|
Input tensor (#batch, time, size).
|
||||||
|
mask : paddle.Tensor
|
||||||
Returns:
|
Mask tensor for the input (#batch, time).
|
||||||
paddle.Tensor: Output tensor (#batch, time, size).
|
cache : paddle.Tensor
|
||||||
paddle.Tensor: Mask tensor (#batch, time).
|
Cache tensor of the input (#batch, time - 1, size).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor (#batch, time, size).
|
||||||
|
paddle.Tensor
|
||||||
|
Mask tensor (#batch, time).
|
||||||
"""
|
"""
|
||||||
residual = x
|
residual = x
|
||||||
if self.normalize_before:
|
if self.normalize_before:
|
||||||
|
@@ -82,7 +91,6 @@ class EncoderLayer(nn.Layer):
|
||||||
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
|
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
|
||||||
x_q = x[:, -1:, :]
|
x_q = x[:, -1:, :]
|
||||||
residual = residual[:, -1:, :]
|
residual = residual[:, -1:, :]
|
||||||
# non-pad mask becomes pad mask
|
|
||||||
mask = None if mask is None else mask[:, -1:, :]
|
mask = None if mask is None else mask[:, -1:, :]
|
||||||
|
|
||||||
if self.concat_after:
|
if self.concat_after:
|
||||||
|
@@ -90,6 +98,7 @@
|
||||||
(x, self.self_attn(x_q, x, x, mask)), axis=-1)
|
(x, self.self_attn(x_q, x, x, mask)), axis=-1)
|
||||||
x = residual + self.concat_linear(x_concat)
|
x = residual + self.concat_linear(x_concat)
|
||||||
else:
|
else:
|
||||||
|
|
||||||
x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
|
x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
|
||||||
if not self.normalize_before:
|
if not self.normalize_before:
|
||||||
x = self.norm1(x)
|
x = self.norm1(x)
|
||||||
|
|
|
@@ -32,11 +32,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
||||||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||||
"""Initialize MultiLayeredConv1d module.
|
"""Initialize MultiLayeredConv1d module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
in_chans (int): Number of input channels.
|
----------
|
||||||
hidden_chans (int): Number of hidden channels.
|
in_chans : int
|
||||||
kernel_size (int): Kernel size of conv1d.
|
Number of input channels.
|
||||||
dropout_rate (float): Dropout rate.
|
hidden_chans : int
|
||||||
|
Number of hidden channels.
|
||||||
|
kernel_size : int
|
||||||
|
Kernel size of conv1d.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super(MultiLayeredConv1d, self).__init__()
|
super(MultiLayeredConv1d, self).__init__()
|
||||||
|
@@ -58,14 +63,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Batch of input tensors (B, T, in_chans).
|
||||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Batch of output tensors (B, T, in_chans).
|
||||||
"""
|
"""
|
||||||
# x = paddle.nn.ReLU(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
|
|
||||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||||
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
||||||
[0, 2, 1])
|
[0, 2, 1])
|
||||||
|
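A hedged usage sketch for MultiLayeredConv1d, assuming the class defined above is in scope; it is position-wise over the time axis, so input and output share the (B, T, in_chans) shape:

    import paddle

    mlc = MultiLayeredConv1d(in_chans=256, hidden_chans=1024,
                             kernel_size=3, dropout_rate=0.1)
    x = paddle.randn([4, 100, 256])    # (B, T, in_chans)
    y = mlc(x)                         # (B, T, in_chans), i.e. [4, 100, 256]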
@@ -81,12 +88,16 @@ class Conv1dLinear(paddle.nn.Layer):
|
||||||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||||
"""Initialize Conv1dLinear module.
|
"""Initialize Conv1dLinear module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
in_chans (int): Number of input channels.
|
----------
|
||||||
hidden_chans (int): Number of hidden channels.
|
in_chans : int
|
||||||
kernel_size (int): Kernel size of conv1d.
|
Number of input channels.
|
||||||
dropout_rate (float): Dropout rate.
|
hidden_chans : int
|
||||||
|
Number of hidden channels.
|
||||||
|
kernel_size : int
|
||||||
|
Kernel size of conv1d.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
super(Conv1dLinear, self).__init__()
|
super(Conv1dLinear, self).__init__()
|
||||||
self.w_1 = paddle.nn.Conv1D(
|
self.w_1 = paddle.nn.Conv1D(
|
||||||
|
@@ -95,18 +106,22 @@ class Conv1dLinear(paddle.nn.Layer):
|
||||||
kernel_size,
|
kernel_size,
|
||||||
stride=1,
|
stride=1,
|
||||||
padding=(kernel_size - 1) // 2, )
|
padding=(kernel_size - 1) // 2, )
|
||||||
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans)
|
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True)
|
||||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||||
self.relu = paddle.nn.ReLU()
|
self.relu = paddle.nn.ReLU()
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
|
Batch of input tensors (B, T, in_chans).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Batch of output tensors (B, T, in_chans).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||||
|
|
|
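Conv1dLinear is used the same way (class assumed in scope); the difference from MultiLayeredConv1d is that the second projection w_2 is a Linear layer rather than a Conv1D:

    import paddle

    cl = Conv1dLinear(in_chans=256, hidden_chans=1024,
                      kernel_size=3, dropout_rate=0.1)
    y = cl(paddle.randn([4, 100, 256]))    # -> [4, 100, 256]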
@@ -19,11 +19,14 @@ import paddle
|
||||||
class PositionwiseFeedForward(paddle.nn.Layer):
|
class PositionwiseFeedForward(paddle.nn.Layer):
|
||||||
"""Positionwise feed forward layer.
|
"""Positionwise feed forward layer.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimenstion.
|
----------
|
||||||
hidden_units (int): The number of hidden units.
|
idim : int
|
||||||
dropout_rate (float): Dropout rate.
|
Input dimension.
|
||||||
|
hidden_units : int
|
||||||
|
The number of hidden units.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
@@ -33,8 +36,8 @@ class PositionwiseFeedForward(paddle.nn.Layer):
|
||||||
activation=paddle.nn.ReLU()):
|
activation=paddle.nn.ReLU()):
|
||||||
"""Construct an PositionwiseFeedForward object."""
|
"""Construct an PositionwiseFeedForward object."""
|
||||||
super(PositionwiseFeedForward, self).__init__()
|
super(PositionwiseFeedForward, self).__init__()
|
||||||
self.w_1 = paddle.nn.Linear(idim, hidden_units)
|
self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True)
|
||||||
self.w_2 = paddle.nn.Linear(hidden_units, idim)
|
self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True)
|
||||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||||
self.activation = activation
|
self.activation = activation
|
||||||
|
|
||||||
|
|
|
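A hedged sketch for PositionwiseFeedForward, assuming the usual forward of w_2(dropout(activation(w_1(x)))) implied by the layers above (the forward method itself is not shown in this hunk):

    import paddle

    ff = PositionwiseFeedForward(idim=256, hidden_units=1024, dropout_rate=0.1)
    x = paddle.randn([2, 50, 256])
    y = ff(x)                          # same shape as x: [2, 50, 256]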
@@ -29,12 +29,16 @@ class MultiSequential(paddle.nn.Sequential):
|
||||||
def repeat(N, fn):
|
def repeat(N, fn):
|
||||||
"""Repeat module N times.
|
"""Repeat module N times.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
N (int): Number of repeat time.
|
----------
|
||||||
fn (Callable): Function to generate module.
|
N : int
|
||||||
|
Number of times to repeat.
|
||||||
Returns:
|
fn : Callable
|
||||||
MultiSequential: Repeated model instance.
|
Function to generate module.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
MultiSequential
|
||||||
|
Repeated model instance.
|
||||||
"""
|
"""
|
||||||
return MultiSequential(* [fn(n) for n in range(N)])
|
return MultiSequential(* [fn(n) for n in range(N)])
|
||||||
|
|
|
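A hedged sketch for repeat(): fn receives the block index n and must return a fresh module on every call, so the N blocks do not share parameters (the Linear block is only a placeholder):

    import paddle

    blocks = repeat(3, lambda n: paddle.nn.Linear(8, 8))
    # blocks is a MultiSequential holding fn(0), fn(1), fn(2),
    # each with its own parameters.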
@@ -19,10 +19,12 @@ import paddle
|
||||||
class LayerNorm(paddle.nn.LayerNorm):
|
class LayerNorm(paddle.nn.LayerNorm):
|
||||||
"""Layer normalization module.
|
"""Layer normalization module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
nout (int): Output dim size.
|
----------
|
||||||
dim (int): Dimension to be normalized.
|
nout : int
|
||||||
|
Output dim size.
|
||||||
|
dim : int
|
||||||
|
Dimension to be normalized.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, nout, dim=-1):
|
def __init__(self, nout, dim=-1):
|
||||||
|
@@ -33,12 +35,15 @@ class LayerNorm(paddle.nn.LayerNorm):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Apply layer normalization.
|
"""Apply layer normalization.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (torch.Tensor): Input tensor.
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Input tensor.
|
||||||
torch.Tensor: Normalized tensor.
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Normalized tensor.
|
||||||
"""
|
"""
|
||||||
if self.dim == -1:
|
if self.dim == -1:
|
||||||
return super(LayerNorm, self).forward(x)
|
return super(LayerNorm, self).forward(x)
|
||||||
|
|
|
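A hedged sketch for this LayerNorm wrapper: with the default dim=-1 it behaves like paddle.nn.LayerNorm over the last axis (only the dim == -1 branch is visible in this hunk):

    import paddle

    ln = LayerNorm(256)                # nout=256, normalize the last axis
    x = paddle.randn([2, 50, 256])
    y = ln(x)                          # [2, 50, 256], normalized over the 256-dim axis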
@@ -28,7 +28,7 @@ def is_broadcastable(shp1, shp2):
|
||||||
def masked_fill(xs: paddle.Tensor,
|
def masked_fill(xs: paddle.Tensor,
|
||||||
mask: paddle.Tensor,
|
mask: paddle.Tensor,
|
||||||
value: Union[float, int]):
|
value: Union[float, int]):
|
||||||
# assert is_broadcastable(xs.shape, mask.shape) is True
|
assert is_broadcastable(xs.shape, mask.shape) is True
|
||||||
bshape = paddle.broadcast_shape(xs.shape, mask.shape)
|
bshape = paddle.broadcast_shape(xs.shape, mask.shape)
|
||||||
mask = mask.broadcast_to(bshape)
|
mask = mask.broadcast_to(bshape)
|
||||||
trues = paddle.ones_like(xs) * value
|
trues = paddle.ones_like(xs) * value
|
||||||
|
|
|
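A hedged usage sketch for masked_fill; the tail of the function is cut off by the hunk, but the signature and the broadcast-then-fill pattern above suggest the usual semantics of writing value wherever mask is True:

    import paddle

    xs = paddle.zeros([2, 3])
    mask = paddle.to_tensor([[True, False, False],
                             [True, True, False]])
    out = masked_fill(xs, mask, -1e9)  # presumably -1e9 where mask is True, 0.0 elsewhere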
@@ -13,20 +13,27 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
|
||||||
# pad with zeros up to the longest sequence in this batch
|
|
||||||
def pad_list(xs, pad_value):
|
def pad_list(xs, pad_value):
|
||||||
"""Perform padding for the list of tensors.
|
"""Perform padding for the list of tensors.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
----------
|
||||||
pad_value (float): Value for padding.
|
xs : List[Tensor]
|
||||||
|
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||||
|
pad_value : float
|
||||||
|
Value for padding.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Padded tensor (B, Tmax, `*`).
|
----------
|
||||||
|
Tensor
|
||||||
|
Padded tensor (B, Tmax, `*`).
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
----------
|
||||||
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
||||||
>>> x
|
>>> x
|
||||||
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
||||||
|
@@ -34,11 +41,9 @@ def pad_list(xs, pad_value):
|
||||||
tensor([[1., 1., 1., 1.],
|
tensor([[1., 1., 1., 1.],
|
||||||
[1., 1., 0., 0.],
|
[1., 1., 0., 0.],
|
||||||
[1., 0., 0., 0.]])
|
[1., 0., 0., 0.]])
|
||||||
|
|
||||||
"""
|
"""
|
||||||
n_batch = len(xs)
|
n_batch = len(xs)
|
||||||
max_len = max(x.shape[0] for x in xs)
|
max_len = max(x.shape[0] for x in xs)
|
||||||
# pad = xs[0].new(n_batch, max_len, *xs[0].shape[1:]).fill_(pad_value)
|
|
||||||
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
|
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
|
||||||
|
|
||||||
for i in range(n_batch):
|
for i in range(n_batch):
|
||||||
|
@@ -50,13 +55,18 @@ def pad_list(xs, pad_value):
|
||||||
def make_pad_mask(lengths, length_dim=-1):
|
def make_pad_mask(lengths, length_dim=-1):
|
||||||
"""Make mask tensor containing indices of padded part.
|
"""Make mask tensor containing indices of padded part.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
lengths (LongTensor or List): Batch of lengths (B,).
|
----------
|
||||||
|
lengths : LongTensor or List
|
||||||
|
Batch of lengths (B,).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Mask tensor containing indices of padded part bool.
|
----------
|
||||||
|
Tensor(bool)
|
||||||
|
Mask tensor containing indices of the padded part (bool).
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
----------
|
||||||
With only lengths.
|
With only lengths.
|
||||||
|
|
||||||
>>> lengths = [5, 3, 2]
|
>>> lengths = [5, 3, 2]
|
||||||
|
@@ -64,7 +74,6 @@ def make_pad_mask(lengths, length_dim=-1):
|
||||||
masks = [[0, 0, 0, 0 ,0],
|
masks = [[0, 0, 0, 0 ,0],
|
||||||
[0, 0, 0, 1, 1],
|
[0, 0, 0, 1, 1],
|
||||||
[0, 0, 1, 1, 1]]
|
[0, 0, 1, 1, 1]]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if length_dim == 0:
|
if length_dim == 0:
|
||||||
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
||||||
|
@@ -88,17 +97,24 @@ def make_pad_mask(lengths, length_dim=-1):
|
||||||
def make_non_pad_mask(lengths, length_dim=-1):
|
def make_non_pad_mask(lengths, length_dim=-1):
|
||||||
"""Make mask tensor containing indices of non-padded part.
|
"""Make mask tensor containing indices of non-padded part.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
lengths (LongTensor or List): Batch of lengths (B,).
|
----------
|
||||||
xs (Tensor, optional): The reference tensor.
|
lengths : LongTensor or List
|
||||||
|
Batch of lengths (B,).
|
||||||
|
xs : Tensor, optional
|
||||||
|
The reference tensor.
|
||||||
If set, masks will be the same shape as this tensor.
|
If set, masks will be the same shape as this tensor.
|
||||||
length_dim (int, optional): Dimension indicator of the above tensor.
|
length_dim : int, optional
|
||||||
|
Dimension indicator of the above tensor.
|
||||||
See the example.
|
See the example.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
ByteTensor: mask tensor containing indices of padded part bool.
|
----------
|
||||||
|
Tensor(bool)
|
||||||
|
Mask tensor containing indices of the non-padded part (bool).
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
----------
|
||||||
With only lengths.
|
With only lengths.
|
||||||
|
|
||||||
>>> lengths = [5, 3, 2]
|
>>> lengths = [5, 3, 2]
|
||||||
|
@@ -106,6 +122,37 @@ def make_non_pad_mask(lengths, length_dim=-1):
|
||||||
masks = [[1, 1, 1, 1 ,1],
|
masks = [[1, 1, 1, 1 ,1],
|
||||||
[1, 1, 1, 0, 0],
|
[1, 1, 1, 0, 0],
|
||||||
[1, 1, 0, 0, 0]]
|
[1, 1, 0, 0, 0]]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
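A hedged sketch that uses make_non_pad_mask to zero out padded frames of a batch (the feature axis and dtype cast are illustrative):

    import paddle

    lengths = [5, 3, 2]
    mask = make_non_pad_mask(lengths)              # (3, 5) bool, True on valid frames
    xs = paddle.randn([3, 5, 8])                   # (B, Tmax, D)
    xs = xs * mask.unsqueeze(-1).astype(xs.dtype)  # padded frames become 0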
||||||
|
|
||||||
|
|
||||||
|
def initialize(model: nn.Layer, init: str):
|
||||||
|
"""Initialize weights of a neural network module.
|
||||||
|
|
||||||
|
Parameters are initialized using the given method or distribution.
|
||||||
|
|
||||||
|
Custom initialization routines can be implemented in submodules.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
model : paddle.nn.Layer
|
||||||
|
Target model.
|
||||||
|
init : str
|
||||||
|
Method of initialization.
|
||||||
|
"""
|
||||||
|
assert check_argument_types()
|
||||||
|
|
||||||
|
if init == "xavier_uniform":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
elif init == "xavier_normal":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.XavierNormal(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
elif init == "kaiming_uniform":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
elif init == "kaiming_normal":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown initialization: " + init)
|
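A hedged sketch for initialize(): in Paddle, set_global_initializer affects parameters created after the call, so choose the scheme before building the layers it should cover (the tiny layers here are placeholders):

    import paddle
    from paddle import nn

    dummy = nn.Linear(2, 2)                # pre-existing parameters are left as they are
    initialize(dummy, "kaiming_uniform")   # weights: KaimingUniform, biases: Constant
    layer = nn.Linear(4, 4)                # created afterwards -> picks up the new init
    # initialize(dummy, "foo") raises ValueError("Unknown initialization: foo")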
||||||
|
|