format docstrings
This commit is contained in:
parent 3af3c29a94
commit 6553d1d723
@ -12,28 +12,26 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fastspeech2 related modules for paddle"""
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from typing import Dict
|
||||
from typing import Sequence
|
||||
from typing import Tuple
|
||||
|
||||
from typeguard import check_argument_types
|
||||
|
||||
import paddle
|
||||
import numpy as np
|
||||
from paddle import nn
|
||||
|
||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
|
||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
|
||||
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
||||
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
|
||||
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
|
||||
from parakeet.modules.nets_utils import make_non_pad_mask
|
||||
from parakeet.modules.nets_utils import make_pad_mask
|
||||
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
|
||||
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
|
||||
from parakeet.modules.nets_utils import initialize
|
||||
from parakeet.modules.nets_utils import make_non_pad_mask
|
||||
from parakeet.modules.nets_utils import make_pad_mask
|
||||
|
||||
|
||||
class FastSpeech2(nn.Layer):
|
||||
|
@ -155,7 +153,6 @@ class FastSpeech2(nn.Layer):
|
|||
positionwise_layer_type=positionwise_layer_type,
|
||||
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
|
||||
else:
|
||||
print("encoder_type:", encoder_type)
|
||||
raise ValueError(f"{encoder_type} is not supported.")
|
||||
|
||||
# define duration predictor
|
||||
|
@ -236,6 +233,12 @@ class FastSpeech2(nn.Layer):
|
|||
use_batch_norm=use_batch_norm,
|
||||
dropout_rate=postnet_dropout_rate, ))
|
||||
|
||||
# initialize parameters
|
||||
self._reset_parameters(
|
||||
init_type=init_type,
|
||||
init_enc_alpha=init_enc_alpha,
|
||||
init_dec_alpha=init_dec_alpha, )
|
||||
|
||||
# define criterions
|
||||
self.criterion = FastSpeech2Loss(
|
||||
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
|
||||
|
@ -253,25 +256,37 @@ class FastSpeech2(nn.Layer):
|
|||
energy: paddle.Tensor,
|
||||
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
|
||||
str, paddle.Tensor], paddle.Tensor]:
|
||||
# """Calculate forward propagation.
|
||||
"""Calculate forward propagation.
|
||||
|
||||
# Args:
|
||||
# text (LongTensor): Batch of padded token ids (B, Tmax).
|
||||
# text_lengths (LongTensor): Batch of lengths of each input (B,).
|
||||
# speech (Tensor): Batch of padded target features (B, Lmax, odim).
|
||||
# speech_lengths (LongTensor): Batch of the lengths of each target (B,).
|
||||
# durations (LongTensor): Batch of padded durations (B, Tmax + 1).
|
||||
# durations_lengths (LongTensor): Batch of duration lengths (B, Tmax + 1).
|
||||
# pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
||||
# pitch_lengths (LongTensor): Batch of pitch lengths (B, Tmax + 1).
|
||||
# energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
||||
# energy_lengths (LongTensor): Batch of energy lengths (B, Tmax + 1).
|
||||
# Returns:
|
||||
# Tensor: Loss scalar value.
|
||||
# Dict: Statistics to be monitored.
|
||||
# Tensor: Weight value.
|
||||
|
||||
# """
|
||||
Parameters
|
||||
----------
|
||||
text : LongTensor
|
||||
Batch of padded token ids (B, Tmax).
|
||||
text_lengths : LongTensor
|
||||
Batch of lengths of each input (B,).
|
||||
speech : Tensor
|
||||
Batch of padded target features (B, Lmax, odim).
|
||||
speech_lengths : LongTensor
|
||||
Batch of the lengths of each target (B,).
|
||||
durations : LongTensor
|
||||
Batch of padded durations (B, Tmax + 1).
|
||||
durations_lengths : LongTensor
|
||||
Batch of duration lengths (B, Tmax + 1).
|
||||
pitch : Tensor
|
||||
Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
||||
pitch_lengths : LongTensor
|
||||
Batch of pitch lengths (B, Tmax + 1).
|
||||
energy : Tensor
|
||||
Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
||||
energy_lengths : LongTensor
|
||||
Batch of energy lengths (B, Tmax + 1).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Loss scalar value.
|
||||
Dict
|
||||
Statistics to be monitored.
Tensor
Weight value.
"""
|
||||
text = text[:, :text_lengths.max()] # for data-parallel
|
||||
speech = speech[:, :speech_lengths.max()] # for data-parallel
|
||||
durations = durations[:, :durations_lengths.max()] # for data-parallel
|
||||
|
@ -282,16 +297,11 @@ class FastSpeech2(nn.Layer):
|
|||
|
||||
# Add eos at the last of sequence
|
||||
# xs = F.pad(text, [0, 1], "constant", self.padding_idx)
|
||||
print("xs.shape in fastspeech2.py before:", text.shape, text)
|
||||
xs = np.pad(text.numpy(),
|
||||
pad_width=((0, 0), (0, 1)),
|
||||
mode="constant",
|
||||
constant_values=self.padding_idx)
|
||||
xs = paddle.to_tensor(xs)
|
||||
print("xs.shape in fastspeech2.py end:", xs.shape, xs)
|
||||
# my_pad = nn.Pad1D(padding=[0, 1], mode="constant", value=self.padding_idx)
|
||||
# xs = my_pad(text)
|
||||
# Could this index go out of bounds? Can xs[i, l] be written? -> Yes, because the previous step appended a padding_idx slot, which now becomes the eos.
|
||||
for i, l in enumerate(text_lengths):
|
||||
xs[i, l] = self.eos
|
||||
ilens = text_lengths + 1
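To make the shape bookkeeping above concrete, here is a small numpy-only sketch (values are made up) of what the padding-plus-eos step produces, assuming padding_idx = 0 and eos = 2:

import numpy as np

padding_idx, eos = 0, 2                       # hypothetical values
text = np.array([[5, 6, 7],                   # length 3
                 [8, 9, 0]])                  # length 2, already right-padded
text_lengths = np.array([3, 2])
xs = np.pad(text, pad_width=((0, 0), (0, 1)), mode="constant",
            constant_values=padding_idx)      # append one padding column
for i, l in enumerate(text_lengths):
    xs[i, l] = eos                            # the appended slot becomes eos
# xs -> [[5, 6, 7, 2], [8, 9, 2, 0]]; ilens -> [4, 3]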
|
||||
|
@ -302,23 +312,16 @@ class FastSpeech2(nn.Layer):
|
|||
# forward propagation
|
||||
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
|
||||
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
|
||||
print("d_outs in paddle:", d_outs)
|
||||
print("p_outs in paddle:", p_outs)
|
||||
print("e_outs in paddle:", e_outs)
|
||||
|
||||
# modify mod part of groundtruth
|
||||
if self.reduction_factor > 1:
|
||||
# TODO: this needs to be revised
|
||||
olens = paddle.to_tensor([
|
||||
olen - olen % self.reduction_factor for olen in olens.numpy()
|
||||
])
|
||||
max_olen = max(olens)
|
||||
ys = ys[:, :max_olen]
|
||||
|
||||
# calculate loss
|
||||
if self.postnet is None:
|
||||
after_outs = None
|
||||
|
||||
# calculate loss
|
||||
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
|
||||
after_outs=after_outs,
|
||||
|
@ -363,9 +366,8 @@ class FastSpeech2(nn.Layer):
|
|||
alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
|
||||
# forward encoder
|
||||
x_masks = self._source_mask(ilens)
|
||||
print("xs.shape in fastspeech2.py:", xs.shape)
|
||||
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
||||
|
||||
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
||||
# forward duration predictor and variance predictors
|
||||
d_masks = make_pad_mask(ilens)
|
||||
|
||||
|
@ -377,10 +379,11 @@ class FastSpeech2(nn.Layer):
|
|||
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
|
||||
else:
|
||||
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
|
||||
print("p_outs.shape:", p_outs.shape)
|
||||
|
||||
if is_inference:
|
||||
d_outs = self.duration_predictor.inference(hs,
|
||||
d_masks) # (B, Tmax)
|
||||
# print("d_outs:",d_outs)
|
||||
# use prediction in inference
|
||||
# (B, Tmax, 1)
|
||||
|
||||
|
@ -404,7 +407,6 @@ class FastSpeech2(nn.Layer):
|
|||
# forward decoder
|
||||
if olens is not None and not is_inference:
|
||||
if self.reduction_factor > 1:
|
||||
# converting directly to a paddle tensor would add an extra dimension, so convert to numpy first
|
||||
olens_in = paddle.to_tensor(
|
||||
[olen // self.reduction_factor for olen in olens.numpy()])
|
||||
else:
|
||||
|
@ -412,9 +414,10 @@ class FastSpeech2(nn.Layer):
|
|||
h_masks = self._source_mask(olens_in)
|
||||
else:
|
||||
h_masks = None
|
||||
zs, _ = self.decoder(hs, h_masks) # (B, Lmax, adim)
|
||||
before_outs = self.feat_out(zs).reshape(
|
||||
(zs.shape[0], -1, self.odim)) # (B, Lmax, odim)
|
||||
# (B, Lmax, adim)
|
||||
zs, _ = self.decoder(hs, h_masks)
|
||||
# (B, Lmax, odim)
|
||||
before_outs = self.feat_out(zs).reshape((zs.shape[0], -1, self.odim))
|
||||
|
||||
# postnet -> (B, Lmax//r * r, odim)
|
||||
if self.postnet is None:
|
||||
|
@ -437,20 +440,30 @@ class FastSpeech2(nn.Layer):
|
|||
paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
||||
"""Generate the sequence of features given the sequences of characters.
|
||||
|
||||
Args:
|
||||
text (LongTensor): Input sequence of characters (T,).
|
||||
speech (Tensor, optional): Feature sequence to extract style (N, idim).
|
||||
durations (LongTensor, optional): Groundtruth of duration (T + 1,).
|
||||
pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1).
|
||||
energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1).
|
||||
alpha (float, optional): Alpha to control the speed.
|
||||
use_teacher_forcing (bool, optional): Whether to use teacher forcing.
|
||||
If true, groundtruth of duration, pitch and energy will be used.
|
||||
Parameters
|
||||
----------
|
||||
text : LongTensor
|
||||
Input sequence of characters (T,).
|
||||
speech : Tensor, optional
|
||||
Feature sequence to extract style (N, idim).
|
||||
durations : LongTensor, optional
|
||||
Groundtruth of duration (T + 1,).
|
||||
pitch : Tensor, optional
|
||||
Groundtruth of token-averaged pitch (T + 1, 1).
|
||||
energy : Tensor, optional
|
||||
Groundtruth of token-averaged energy (T + 1, 1).
|
||||
alpha : float, optional
|
||||
Alpha to control the speed.
|
||||
use_teacher_forcing : bool, optional
|
||||
Whether to use teacher forcing.
|
||||
If true, groundtruth of duration, pitch and energy will be used.
|
||||
|
||||
Returns:
|
||||
Tensor: Output sequence of features (L, odim).
|
||||
None: Dummy for compatibility.
|
||||
None: Dummy for compatibility.
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Output sequence of features (L, odim).
|
||||
None
|
||||
Dummy for compatibility.
|
||||
|
||||
"""
|
||||
x, y = text, speech
|
||||
|
@ -460,13 +473,15 @@ class FastSpeech2(nn.Layer):
|
|||
x = np.pad(text.numpy(),
|
||||
pad_width=((0, 1)),
|
||||
mode="constant",
|
||||
constant_values=self.padding_idx)
|
||||
constant_values=self.eos)
|
||||
|
||||
x = paddle.to_tensor(x)
|
||||
|
||||
# setup batch axis
|
||||
ilens = paddle.to_tensor(
|
||||
[x.shape[0]], dtype=paddle.int64, place=x.place)
|
||||
xs, ys = x.unsqueeze(0), None
|
||||
|
||||
if y is not None:
|
||||
ys = y.unsqueeze(0)
|
||||
|
||||
|
@ -493,14 +508,19 @@ class FastSpeech2(nn.Layer):
|
|||
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
|
||||
"""Make masks for self-attention.
|
||||
|
||||
Args:
|
||||
ilens (LongTensor): Batch of lengths (B,).
|
||||
Parameters
|
||||
----------
|
||||
ilens : LongTensor
|
||||
Batch of lengths (B,).
|
||||
|
||||
Returns:
|
||||
Tensor: Mask tensor for self-attention.
|
||||
Returns
|
||||
-------
|
||||
Tensor
|
||||
Mask tensor for self-attention.
|
||||
dtype=paddle.bool
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
-------
|
||||
>>> ilens = [5, 3]
|
||||
>>> self._source_mask(ilens)
|
||||
tensor([[[1, 1, 1, 1, 1],
|
||||
|
@ -510,6 +530,29 @@ class FastSpeech2(nn.Layer):
|
|||
x_masks = make_non_pad_mask(ilens)
|
||||
return x_masks.unsqueeze(-2)
|
||||
|
||||
def _reset_parameters(self,
|
||||
init_type: str,
|
||||
init_enc_alpha: float,
|
||||
init_dec_alpha: float):
|
||||
# initialize parameters
|
||||
initialize(self, init_type)
|
||||
|
||||
# initialize alpha in scaled positional encoding
|
||||
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
|
||||
self.encoder.embed[-1].alpha = paddle.create_parameter(
|
||||
shape=init_enc_alpha.shape,
|
||||
dtype=str(init_enc_alpha.numpy().dtype),
|
||||
default_initializer=paddle.nn.initializer.Assign(
|
||||
init_enc_alpha))
|
||||
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
|
||||
self.decoder.embed[-1].alpha = paddle.create_parameter(
|
||||
shape=init_dec_alpha.shape,
|
||||
dtype=str(init_dec_alpha.numpy().dtype),
|
||||
default_initializer=paddle.nn.initializer.Assign(
|
||||
init_dec_alpha))
|
||||
|
||||
|
||||
class FastSpeech2Loss(nn.Layer):
|
||||
"""Loss function module for FastSpeech2."""
|
||||
|
@ -519,12 +562,12 @@ class FastSpeech2Loss(nn.Layer):
|
|||
use_weighted_masking: bool=False):
|
||||
"""Initialize feed-forward Transformer loss module.
|
||||
|
||||
Args:
|
||||
use_masking (bool):
|
||||
Parameters
|
||||
----------
|
||||
use_masking : bool
|
||||
Whether to apply masking for padded part in loss calculation.
|
||||
use_weighted_masking (bool):
|
||||
use_weighted_masking : bool
|
||||
Whether to apply weighted masking in loss calculation.
|
||||
|
||||
"""
|
||||
assert check_argument_types()
|
||||
super().__init__()
|
||||
|
@ -555,24 +598,41 @@ class FastSpeech2Loss(nn.Layer):
|
|||
paddle.Tensor, paddle.Tensor]:
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
|
||||
before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
|
||||
d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax).
|
||||
p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||
e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
|
||||
ys (Tensor): Batch of target features (B, Lmax, odim).
|
||||
ds (LongTensor): Batch of durations (B, Tmax).
|
||||
ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
|
||||
es (Tensor): Batch of target token-averaged energy (B, Tmax, 1).
|
||||
ilens (LongTensor): Batch of the lengths of each input (B,).
|
||||
olens (LongTensor): Batch of the lengths of each target (B,).
|
||||
Parameters
|
||||
----------
|
||||
after_outs : Tensor
|
||||
Batch of outputs after postnets (B, Lmax, odim).
|
||||
before_outs : Tensor
|
||||
Batch of outputs before postnets (B, Lmax, odim).
|
||||
d_outs : LongTensor
|
||||
Batch of outputs of duration predictor (B, Tmax).
|
||||
p_outs : Tensor
|
||||
Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||
e_outs : Tensor
|
||||
Batch of outputs of energy predictor (B, Tmax, 1).
|
||||
ys : Tensor
|
||||
Batch of target features (B, Lmax, odim).
|
||||
ds : LongTensor
|
||||
Batch of durations (B, Tmax).
|
||||
ps : Tensor
|
||||
Batch of target token-averaged pitch (B, Tmax, 1).
|
||||
es : Tensor
|
||||
Batch of target token-averaged energy (B, Tmax, 1).
|
||||
ilens : LongTensor
|
||||
Batch of the lengths of each input (B,).
|
||||
olens : LongTensor
|
||||
Batch of the lengths of each target (B,).
|
||||
|
||||
Returns:
|
||||
Tensor: L1 loss value.
|
||||
Tensor: Duration predictor loss value.
|
||||
Tensor: Pitch predictor loss value.
|
||||
Tensor: Energy predictor loss value.
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
L1 loss value.
|
||||
Tensor
|
||||
Duration predictor loss value.
|
||||
Tensor
|
||||
Pitch predictor loss value.
|
||||
Tensor
|
||||
Energy predictor loss value.
|
||||
|
||||
"""
|
||||
# apply mask to remove padded part
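The comment above refers to the standard masked-loss pattern. A minimal sketch of that pattern, assuming use_masking=True and reusing make_non_pad_mask from nets_utils (not this file's exact code):

# illustrative only: drop padded frames before computing the L1 loss
out_masks = make_non_pad_mask(olens).unsqueeze(-1)            # (B, Lmax, 1), bool
l1 = paddle.nn.L1Loss()
valid_outs = before_outs.masked_select(out_masks.broadcast_to(before_outs.shape))
valid_ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
l1_loss = l1(valid_outs, valid_ys)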
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from parakeet.modules.layer_norm import LayerNorm
|
||||
from parakeet.modules.masked_fill import masked_fill
|
||||
|
||||
|
@ -31,7 +30,8 @@ class DurationPredictor(nn.Layer):
|
|||
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
|
||||
https://arxiv.org/pdf/1905.09263.pdf
|
||||
|
||||
Note:
|
||||
Note
|
||||
----------
|
||||
The calculation domain of outputs is different
|
||||
between in `forward` and in `inference`. In `forward`,
|
||||
the outputs are calculated in log domain but in `inference`,
|
||||
|
@ -48,13 +48,20 @@ class DurationPredictor(nn.Layer):
|
|||
offset=1.0):
|
||||
"""Initilize duration predictor module.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
n_layers (int, optional): Number of convolutional layers.
|
||||
n_chans (int, optional): Number of channels of convolutional layers.
|
||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
||||
dropout_rate (float, optional): Dropout rate.
|
||||
offset (float, optional): Offset value to avoid nan in log domain.
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
|
||||
"""
|
||||
super(DurationPredictor, self).__init__()
|
||||
|
@ -74,7 +81,7 @@ class DurationPredictor(nn.Layer):
|
|||
LayerNorm(
|
||||
n_chans, dim=1),
|
||||
nn.Dropout(dropout_rate), ))
|
||||
self.linear = nn.Linear(n_chans, 1)
|
||||
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||
|
||||
def _forward(self, xs, x_masks=None, is_inference=False):
|
||||
# (B, idim, Tmax)
|
||||
|
@ -83,7 +90,7 @@ class DurationPredictor(nn.Layer):
|
|||
for f in self.conv:
|
||||
xs = f(xs)
|
||||
|
||||
# NOTE: calculate in log domain
|
||||
# NOTE: calculate in log domain
|
||||
# (B, Tmax)
|
||||
xs = self.linear(xs.transpose([0, 2, 1])).squeeze(-1)
|
||||
|
||||
|
@ -99,28 +106,34 @@ class DurationPredictor(nn.Layer):
|
|||
def forward(self, xs, x_masks=None):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
||||
x_masks (ByteTensor, optional):
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : ByteTensor, optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
|
||||
Returns:
|
||||
Tensor: Batch of predicted durations in log domain (B, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of predicted durations in log domain (B, Tmax).
|
||||
"""
|
||||
return self._forward(xs, x_masks, False)
|
||||
|
||||
def inference(self, xs, x_masks=None):
|
||||
"""Inference duration.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
||||
x_masks (ByteTensor, optional):
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : Tensor(bool), optional
|
||||
Batch of masks indicating padded part (B, Tmax).
|
||||
|
||||
Returns:
|
||||
LongTensor: Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||
|
||||
Returns
|
||||
----------
|
||||
LongTensor
|
||||
Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||
"""
|
||||
return self._forward(xs, x_masks, True)
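A hedged usage sketch of the two entry points above; the idim value, the input shapes, and the reliance on default constructor arguments are assumptions for illustration:

import paddle
predictor = DurationPredictor(idim=384)          # other arguments assumed to default
xs = paddle.randn([2, 10, 384])                  # (B, Tmax, idim)
log_d = predictor(xs)                            # forward: (B, Tmax), log domain
lin_d = predictor.inference(xs)                  # inference: (B, Tmax), int64, linear domain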
|
||||
|
||||
|
@ -135,10 +148,12 @@ class DurationPredictorLoss(nn.Layer):
|
|||
def __init__(self, offset=1.0, reduction="mean"):
|
||||
"""Initilize duration predictor loss module.
|
||||
|
||||
Args:
|
||||
offset (float, optional): Offset value to avoid nan in log domain.
|
||||
reduction (str): Reduction type in loss calculation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
offset : float, optional
|
||||
Offset value to avoid nan in log domain.
|
||||
reduction : str
|
||||
Reduction type in loss calculation.
|
||||
"""
|
||||
super(DurationPredictorLoss, self).__init__()
|
||||
self.criterion = nn.MSELoss(reduction=reduction)
|
||||
|
@ -147,16 +162,21 @@ class DurationPredictorLoss(nn.Layer):
|
|||
def forward(self, outputs, targets):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
outputs (Tensor): Batch of prediction durations in log domain (B, T)
|
||||
targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)
|
||||
Parameters
|
||||
----------
|
||||
outputs : Tensor
|
||||
Batch of predicted durations in log domain (B, T).
|
||||
targets : LongTensor
|
||||
Batch of groundtruth durations in linear domain (B, T)
|
||||
|
||||
Returns:
|
||||
Tensor: Mean squared error loss value.
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Mean squared error loss value.
|
||||
|
||||
Note:
|
||||
Note
|
||||
----------
|
||||
`outputs` is in log domain but `targets` is in linear domain.
|
||||
|
||||
"""
|
||||
# NOTE: outputs is in log domain while targets in linear
|
||||
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
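A small worked example of the conversion above, assuming the default offset of 1.0:

import paddle
targets = paddle.to_tensor([0, 1, 4], dtype='int64')       # durations in frames
log_targets = paddle.log(targets.cast('float32') + 1.0)    # -> [0.0, 0.693, 1.609]
# the MSE criterion then compares these against predictions already in log domain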
|
||||
|
|
|
@ -13,8 +13,6 @@
|
|||
# limitations under the License.
|
||||
"""Length regulator related modules."""
|
||||
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
@ -37,8 +35,10 @@ class LengthRegulator(nn.Layer):
|
|||
def __init__(self, pad_value=0.0):
|
||||
"""Initilize length regulator module.
|
||||
|
||||
Args:
|
||||
pad_value (float, optional): Value used for padding.
|
||||
Parameters
|
||||
----------
|
||||
pad_value : float, optional
|
||||
Value used for padding.
|
||||
|
||||
"""
|
||||
super().__init__()
|
||||
|
@ -68,14 +68,19 @@ class LengthRegulator(nn.Layer):
|
|||
def forward(self, xs, ds, alpha=1.0):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||
ds (LongTensor): Batch of durations of each frame (B, T).
|
||||
alpha (float, optional): Alpha value to control speed of speech.
|
||||
|
||||
Returns:
|
||||
Tensor: replicated input tensor based on durations (B, T*, D).
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||
ds : LongTensor
|
||||
Batch of durations of each frame (B, T).
|
||||
alpha : float, optional
|
||||
Alpha value to control speed of speech.
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Replicated input tensor based on durations (B, T*, D).
|
||||
"""
|
||||
if alpha != 1.0:
|
||||
assert alpha > 0
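Duration-based expansion is easiest to see on a single unbatched sequence; a numpy sketch (illustrative only, the module additionally handles batching, padding and alpha):

import numpy as np
xs = np.array([[1., 1.], [2., 2.], [3., 3.]])   # (T=3, D=2) token embeddings
ds = np.array([2, 0, 3])                        # durations per token
expanded = np.repeat(xs, ds, axis=0)            # (sum(ds)=5, D=2)
# expanded -> [[1,1],[1,1],[3,3],[3,3],[3,3]]: each token repeated ds[i] times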
|
||||
|
|
|
@ -43,15 +43,22 @@ class Postnet(nn.Layer):
|
|||
use_batch_norm=True, ):
|
||||
"""Initialize postnet module.
|
||||
|
||||
Args:
|
||||
idim (int): Dimension of the inputs.
|
||||
odim (int): Dimension of the outputs.
|
||||
n_layers (int, optional): The number of layers.
|
||||
n_filts (int, optional): The number of filter size.
|
||||
n_units (int, optional): The number of filter channels.
|
||||
use_batch_norm (bool, optional): Whether to use batch normalization..
|
||||
dropout_rate (float, optional): Dropout rate..
|
||||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Dimension of the inputs.
|
||||
odim : int
|
||||
Dimension of the outputs.
|
||||
n_layers : int, optional
|
||||
The number of layers.
|
||||
n_filts : int, optional
|
||||
The number of filter size.
|
||||
n_units : int, optional
|
||||
The number of filter channels.
|
||||
use_batch_norm : bool, optional
|
||||
Whether to use batch normalization.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
"""
|
||||
super(Postnet, self).__init__()
|
||||
self.postnet = nn.LayerList()
|
||||
|
@ -111,11 +118,15 @@ class Postnet(nn.Layer):
|
|||
def forward(self, xs):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||
|
||||
Returns:
|
||||
Tensor: Batch of padded output tensor. (B, odim, Tmax).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of padded output tensor (B, odim, Tmax).
|
||||
|
||||
"""
|
||||
for i in six.moves.range(len(self.postnet)):
|
||||
|
|
|
@ -15,10 +15,8 @@
|
|||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from parakeet.modules.layer_norm import LayerNorm
|
||||
from parakeet.modules.masked_fill import masked_fill
|
||||
|
||||
from typeguard import check_argument_types
|
||||
|
||||
|
||||
|
@ -43,13 +41,18 @@ class VariancePredictor(nn.Layer):
|
|||
dropout_rate: float=0.5, ):
|
||||
"""Initilize duration predictor module.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
n_layers (int, optional): Number of convolutional layers.
|
||||
n_chans (int, optional): Number of channels of convolutional layers.
|
||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
||||
dropout_rate (float, optional): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
n_layers : int, optional
|
||||
Number of convolutional layers.
|
||||
n_chans : int, optional
|
||||
Number of channels of convolutional layers.
|
||||
kernel_size : int, optional
|
||||
Kernel size of convolutional layers.
|
||||
dropout_rate : float, optional
|
||||
Dropout rate.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
super().__init__()
|
||||
|
@ -70,26 +73,30 @@ class VariancePredictor(nn.Layer):
|
|||
n_chans, dim=1),
|
||||
nn.Dropout(dropout_rate), ))
|
||||
|
||||
self.linear = nn.Linear(n_chans, 1)
|
||||
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||
|
||||
def forward(self, xs: paddle.Tensor,
|
||||
x_masks: paddle.Tensor=None) -> paddle.Tensor:
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
||||
x_masks (ByteTensor, optional):
|
||||
Parameters
|
||||
----------
|
||||
xs : Tensor
|
||||
Batch of input sequences (B, Tmax, idim).
|
||||
x_masks : Tensor(bool), optional
|
||||
Batch of masks indicating padded part (B, Tmax, 1).
|
||||
|
||||
Returns:
|
||||
Tensor: Batch of predicted sequences (B, Tmax, 1).
|
||||
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Batch of predicted sequences (B, Tmax, 1).
|
||||
"""
|
||||
# (B, idim, Tmax)
|
||||
xs = xs.transpose([0, 2, 1])
|
||||
# (B, C, Tmax)
|
||||
for f in self.conv:
|
||||
xs = f(xs) # (B, C, Tmax)
|
||||
# (B, C, Tmax)
|
||||
xs = f(xs)
|
||||
# (B, Tmax, 1)
|
||||
xs = self.linear(xs.transpose([0, 2, 1]))
|
||||
|
||||
|
|
|
@ -16,23 +16,22 @@
|
|||
import math
|
||||
|
||||
import numpy
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from paddle.fluid.layers import sequence_mask
|
||||
|
||||
from parakeet.modules.masked_fill import masked_fill
|
||||
|
||||
|
||||
class MultiHeadedAttention(nn.Layer):
|
||||
"""Multi-Head Attention layer.
|
||||
|
||||
Args:
|
||||
n_head (int): The number of heads.
|
||||
n_feat (int): The number of features.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_head : int
|
||||
The number of heads.
|
||||
n_feat : int
|
||||
The number of features.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
|
||||
def __init__(self, n_head, n_feat, dropout_rate):
|
||||
|
@ -42,33 +41,42 @@ class MultiHeadedAttention(nn.Layer):
|
|||
# We assume d_v always equals d_k
|
||||
self.d_k = n_feat // n_head
|
||||
self.h = n_head
|
||||
self.linear_q = nn.Linear(n_feat, n_feat)
|
||||
self.linear_k = nn.Linear(n_feat, n_feat)
|
||||
self.linear_v = nn.Linear(n_feat, n_feat)
|
||||
self.linear_out = nn.Linear(n_feat, n_feat)
|
||||
self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||
self.attn = None
|
||||
self.dropout = nn.Dropout(p=dropout_rate)
|
||||
|
||||
def forward_qkv(self, query, key, value):
|
||||
"""Transform query, key and value.
|
||||
|
||||
Args:
|
||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
|
||||
paddle.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
|
||||
Parameters
|
||||
----------
|
||||
query : paddle.Tensor
|
||||
Query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Transformed query tensor (#batch, n_head, time1, d_k).
|
||||
paddle.Tensor
|
||||
Transformed key tensor (#batch, n_head, time2, d_k).
|
||||
paddle.Tensor
|
||||
Transformed value tensor (#batch, n_head, time2, d_k).
|
||||
"""
|
||||
n_batch = query.shape[0]
|
||||
|
||||
q = paddle.reshape(
|
||||
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
|
||||
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
|
||||
v = paddle.reshape(
|
||||
self.linear_v(value), [n_batch, -1, self.h, self.d_k])
|
||||
|
||||
# (batch, head, time1, d_k)
|
||||
q = q.transpose((0, 2, 1, 3))
|
||||
# (batch, head, time2, d_k)
|
||||
|
@ -80,44 +88,40 @@ class MultiHeadedAttention(nn.Layer):
|
|||
def forward_attention(self, value, scores, mask=None):
|
||||
"""Compute attention context vector.
|
||||
|
||||
Args:
|
||||
value (paddle.Tensor): Transformed value (#batch, n_head, time2, d_k).
|
||||
scores (paddle.Tensor): Attention score (#batch, n_head, time1, time2).
|
||||
mask (paddle.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||
Parameters
|
||||
----------
|
||||
value : paddle.Tensor
|
||||
Transformed value (#batch, n_head, time2, d_k).
|
||||
scores : paddle.Tensor
|
||||
Attention score (#batch, n_head, time1, time2).
|
||||
mask : paddle.Tensor
|
||||
Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Transformed value (#batch, time1, d_model)
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Transformed value (#batch, time1, d_model)
|
||||
weighted by the attention score (#batch, time1, time2).
|
||||
|
||||
"""
|
||||
n_batch = value.shape[0]
|
||||
softmax = paddle.nn.Softmax(axis=-1)
|
||||
if mask is not None:
|
||||
|
||||
mask = mask.unsqueeze(1)
|
||||
# invert the mask: padded positions become True and are later filled with 0
|
||||
mask = paddle.logical_not(mask)
|
||||
|
||||
# mask = paddle.cast(mask, dtype='int64')
|
||||
# positions where mask == 1 are replaced with min_value
|
||||
# scores = scores.masked_fill(mask, min_value)
|
||||
min_value = float(
|
||||
numpy.finfo(
|
||||
paddle.to_tensor(
|
||||
0, dtype=scores.dtype).numpy().dtype).min)
|
||||
|
||||
scores = masked_fill(scores, mask, min_value)
|
||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
||||
|
||||
# fill the elements of the tensor where the mask is 1 with the given value, i.e. keep only positions where the mask is 0
|
||||
# self.attn = torch.softmax(scores, dim=-1).masked_fill(
|
||||
# mask, 0.0
|
||||
# ) # (batch, head, time1, time2)
|
||||
# keep positions where the mask is 0; everything else becomes 0
|
||||
# (batch, head, time1, time2)
|
||||
self.attn = softmax(scores)
|
||||
self.attn = masked_fill(self.attn, mask, 0.0)
|
||||
else:
|
||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
||||
# (batch, head, time1, time2)
|
||||
# (batch, head, time1, time2)
|
||||
self.attn = softmax(scores)
|
||||
# (batch, head, time1, time2)
|
||||
p_attn = self.dropout(self.attn)
|
||||
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
|
||||
x = paddle.matmul(p_attn, value)
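The masking logic above (fill padded scores with a very large negative number, softmax, then zero the padded weights) can be sketched with plain numpy; shapes and values are illustrative:

import numpy as np
scores = np.array([[0.5, 1.0, 0.2]])            # (1, time2) raw attention scores
non_pad = np.array([[True, True, False]])       # True = real token
pad_mask = ~non_pad                             # True = padded position
scores = np.where(pad_mask, np.finfo(scores.dtype).min, scores)
attn = np.exp(scores - scores.max(-1, keepdims=True))
attn /= attn.sum(-1, keepdims=True)             # softmax: padded weight is ~0
attn = np.where(pad_mask, 0.0, attn)            # explicit zero, like masked_fill above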
|
||||
|
@ -130,16 +134,21 @@ class MultiHeadedAttention(nn.Layer):
|
|||
def forward(self, query, key, value, mask=None):
|
||||
"""Compute scaled dot product attention.
|
||||
|
||||
Args:
|
||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
||||
mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
|
||||
(#batch, time1, time2).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor (#batch, time1, d_model).
|
||||
Parameters
|
||||
----------
|
||||
query : paddle.Tensor
|
||||
Query tensor (#batch, time1, size).
|
||||
key : paddle.Tensor
|
||||
Key tensor (#batch, time2, size).
|
||||
value : paddle.Tensor
|
||||
Value tensor (#batch, time2, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time1, d_model).
|
||||
"""
|
||||
q, k, v = self.forward_qkv(query, key, value)
|
||||
scores = paddle.matmul(q, k.transpose(
|
||||
|
|
|
@ -22,14 +22,16 @@ from paddle import nn
|
|||
class PositionalEncoding(nn.Layer):
|
||||
"""Positional encoding.
|
||||
|
||||
Args:
|
||||
d_model (int): Embedding dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
max_len (int): Maximum input length.
|
||||
reverse (bool): Whether to reverse the input position. Only for
|
||||
the class LegacyRelPositionalEncoding. We remove it in the current
|
||||
class RelPositionalEncoding.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d_model : int
|
||||
Embedding dimension.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
max_len : int
|
||||
Maximum input length.
|
||||
reverse : bool
|
||||
Whether to reverse the input position. Only for the class LegacyRelPositionalEncoding.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
||||
|
@ -47,7 +49,6 @@ class PositionalEncoding(nn.Layer):
|
|||
|
||||
pe = paddle.zeros([x.shape[1], self.d_model])
|
||||
if self.reverse:
|
||||
# (x.shape[1],1)
|
||||
position = paddle.arange(
|
||||
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
|
||||
else:
|
||||
|
@ -65,12 +66,15 @@ class PositionalEncoding(nn.Layer):
|
|||
def forward(self, x: paddle.Tensor):
|
||||
"""Add positional encoding.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor (batch, time, `*`).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Encoded tensor (batch, time, `*`).
|
||||
"""
|
||||
self.extend_pe(x)
|
||||
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
||||
|
@ -82,11 +86,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
|||
|
||||
See Sec. 3.2 https://arxiv.org/abs/1809.08895
|
||||
|
||||
Args:
|
||||
d_model (int): Embedding dimension.
|
||||
dropout_rate (float): Dropout rate.
|
||||
max_len (int): Maximum input length.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d_model : int
|
||||
Embedding dimension.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
max_len : int
|
||||
Maximum input length.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, dropout_rate, max_len=5000):
|
||||
|
@ -106,12 +113,15 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
|||
def forward(self, x):
|
||||
"""Add positional encoding.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor (batch, time, `*`).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Encoded tensor (batch, time, `*`).
|
||||
"""
|
||||
self.extend_pe(x)
|
||||
x = x + self.alpha * self.pe[:, :x.shape[1]]
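For reference, a hedged numpy sketch of the sinusoidal table both classes index into, and how each class combines it with the input (xscale is commonly sqrt(d_model); alpha is the learnable scalar of ScaledPositionalEncoding):

import numpy as np
d_model, max_len = 8, 16
position = np.arange(max_len, dtype="float32")[:, None]
div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
pe = np.zeros([max_len, d_model], dtype="float32")
pe[:, 0::2] = np.sin(position * div_term)
pe[:, 1::2] = np.cos(position * div_term)
# PositionalEncoding:       x * xscale + pe[:T]
# ScaledPositionalEncoding: x + alpha * pe[:T]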
|
||||
|
|
|
@ -12,19 +12,11 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
|
||||
import numpy
|
||||
import logging
|
||||
import paddle
|
||||
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
from paddle.nn import initializer as I
|
||||
from paddle.fluid.layers import sequence_mask
|
||||
import sys
|
||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
|
||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
|
||||
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
|
||||
|
@ -35,28 +27,44 @@ from parakeet.modules.fastspeech2_transformer.repeat import repeat
|
|||
class Encoder(nn.Layer):
|
||||
"""Transformer encoder module.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimension.
|
||||
attention_dim (int): Dimention of attention.
|
||||
attention_heads (int): The number of heads of multi head attention.
|
||||
linear_units (int): The number of units of position-wise feed forward.
|
||||
num_blocks (int): The number of decoder blocks.
|
||||
dropout_rate (float): Dropout rate.
|
||||
positional_dropout_rate (float): Dropout rate after adding positional encoding.
|
||||
attention_dropout_rate (float): Dropout rate in attention.
|
||||
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
|
||||
pos_enc_class (paddle.nn.Layer): Positional encoding module class.
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
attention_dim : int
|
||||
Dimension of attention.
|
||||
attention_heads : int
|
||||
The number of heads of multi head attention.
|
||||
linear_units : int
|
||||
The number of units of position-wise feed forward.
|
||||
num_blocks : int
|
||||
The number of decoder blocks.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
positional_dropout_rate : float
|
||||
Dropout rate after adding positional encoding.
|
||||
attention_dropout_rate : float
|
||||
Dropout rate in attention.
|
||||
input_layer : Union[str, paddle.nn.Layer]
|
||||
Input layer type.
|
||||
pos_enc_class : paddle.nn.Layer
|
||||
Positional encoding module class.
|
||||
`PositionalEncoding` or `ScaledPositionalEncoding`
|
||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
||||
concat_after (bool): Whether to concat attention layer's input and output.
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
|
||||
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
|
||||
selfattention_layer_type (str): Encoder attention layer type.
|
||||
padding_idx (int): Padding idx for input_layer=embed.
|
||||
|
||||
positionwise_layer_type : str
|
||||
"linear", "conv1d", or "conv1d-linear".
|
||||
positionwise_conv_kernel_size : int
|
||||
Kernel size of positionwise conv1d layer.
|
||||
selfattention_layer_type : str
|
||||
Encoder attention layer type.
|
||||
padding_idx : int
|
||||
Padding idx for input_layer=embed.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -82,7 +90,8 @@ class Encoder(nn.Layer):
|
|||
self.conv_subsampling_factor = 1
|
||||
if input_layer == "linear":
|
||||
self.embed = nn.Sequential(
|
||||
nn.Linear(idim, attention_dim),
|
||||
nn.Linear(
|
||||
idim, attention_dim, bias_attr=True),
|
||||
nn.LayerNorm(attention_dim),
|
||||
nn.Dropout(dropout_rate),
|
||||
nn.ReLU(),
|
||||
|
@ -169,14 +178,19 @@ class Encoder(nn.Layer):
|
|||
def forward(self, xs, masks):
|
||||
"""Encode input sequence.
|
||||
|
||||
Args:
|
||||
xs (paddle.Tensor): Input tensor (#batch, time, idim).
|
||||
masks (paddle.Tensor): Mask tensor (#batch, time).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor (#batch, time, attention_dim).
|
||||
paddle.Tensor: Mask tensor (#batch, time).
|
||||
Parameters
|
||||
----------
|
||||
xs : paddle.Tensor
|
||||
Input tensor (#batch, time, idim).
|
||||
masks : paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, attention_dim).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
"""
|
||||
xs = self.embed(xs)
|
||||
xs, masks = self.encoders(xs, masks)
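A hedged note on the mask convention this forward assumes: the per-utterance lengths are turned into a boolean (#batch, 1, time) mask, mirroring _source_mask in fastspeech2.py (values are illustrative):

import paddle
ilens = paddle.to_tensor([5, 3])
masks = make_non_pad_mask(ilens).unsqueeze(-2)   # (B, 1, Tmax), True on real tokens
# hs, hs_masks = encoder(xs, masks)              # xs: (B, Tmax, idim)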
|
||||
|
@ -187,16 +201,23 @@ class Encoder(nn.Layer):
|
|||
def forward_one_step(self, xs, masks, cache=None):
|
||||
"""Encode input frame.
|
||||
|
||||
Args:
|
||||
xs (paddle.Tensor): Input tensor.
|
||||
masks (paddle.Tensor): Mask tensor.
|
||||
cache (List[paddle.Tensor]): List of cache tensors.
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor.
|
||||
paddle.Tensor: Mask tensor.
|
||||
List[paddle.Tensor]: List of new cache tensors.
|
||||
Parameters
|
||||
----------
|
||||
xs : paddle.Tensor
|
||||
Input tensor.
|
||||
masks : paddle.Tensor
|
||||
Mask tensor.
|
||||
cache : List[paddle.Tensor]
|
||||
List of cache tensors.
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor.
|
||||
paddle.Tensor
|
||||
Mask tensor.
|
||||
List[paddle.Tensor]
|
||||
List of new cache tensors.
|
||||
"""
|
||||
|
||||
xs = self.embed(xs)
|
||||
|
|
|
@ -14,28 +14,31 @@
|
|||
"""Encoder self-attention layer definition."""
|
||||
|
||||
import paddle
|
||||
|
||||
from paddle import nn
|
||||
|
||||
|
||||
class EncoderLayer(nn.Layer):
|
||||
"""Encoder layer module.
|
||||
|
||||
Args:
|
||||
size (int): Input dimension.
|
||||
self_attn (paddle.nn.Layer): Self-attention module instance.
|
||||
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
|
||||
can be used as the argument.
|
||||
feed_forward (paddle.nn.Layer): Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
||||
can be used as the argument.
|
||||
dropout_rate (float): Dropout rate.
|
||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
||||
concat_after (bool): Whether to concat attention layer's input and output.
|
||||
Parameters
|
||||
----------
|
||||
size : int
|
||||
Input dimension.
|
||||
self_attn : paddle.nn.Layer
|
||||
Self-attention module instance.
|
||||
`MultiHeadedAttention` instance can be used as the argument.
|
||||
feed_forward : paddle.nn.Layer
|
||||
Feed-forward module instance.
|
||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
normalize_before : bool
|
||||
Whether to use layer_norm before the first block.
|
||||
concat_after : bool
|
||||
Whether to concat attention layer's input and output.
|
||||
if True, additional linear will be applied.
|
||||
i.e. x -> x + linear(concat(x, att(x)))
|
||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
@ -57,20 +60,26 @@ class EncoderLayer(nn.Layer):
|
|||
self.normalize_before = normalize_before
|
||||
self.concat_after = concat_after
|
||||
if self.concat_after:
|
||||
self.concat_linear = nn.Linear(size + size, size)
|
||||
self.concat_linear = nn.Linear(size + size, size, bias_attr=True)
|
||||
|
||||
def forward(self, x, mask, cache=None):
|
||||
"""Compute encoded features.
|
||||
|
||||
Args:
|
||||
x_input (paddle.Tensor): Input tensor (#batch, time, size).
|
||||
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
|
||||
cache (paddle.Tensor): Cache tensor of the input (#batch, time - 1, size).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Output tensor (#batch, time, size).
|
||||
paddle.Tensor: Mask tensor (#batch, time).
|
||||
Parameters
|
||||
----------
|
||||
x_input : paddle.Tensor
|
||||
Input tensor (#batch, time, size).
|
||||
mask : paddle.Tensor
|
||||
Mask tensor for the input (#batch, time).
|
||||
cache : paddle.Tensor
|
||||
Cache tensor of the input (#batch, time - 1, size).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Output tensor (#batch, time, size).
|
||||
paddle.Tensor
|
||||
Mask tensor (#batch, time).
|
||||
"""
|
||||
residual = x
|
||||
if self.normalize_before:
|
||||
|
@ -82,7 +91,6 @@ class EncoderLayer(nn.Layer):
|
|||
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
|
||||
x_q = x[:, -1:, :]
|
||||
residual = residual[:, -1:, :]
|
||||
# the non-pad mask becomes a pad mask here
|
||||
mask = None if mask is None else mask[:, -1:, :]
|
||||
|
||||
if self.concat_after:
|
||||
|
@ -90,6 +98,7 @@ class EncoderLayer(nn.Layer):
|
|||
(x, self.self_attn(x_q, x, x, mask)), axis=-1)
|
||||
x = residual + self.concat_linear(x_concat)
|
||||
else:
|
||||
|
||||
x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
|
||||
if not self.normalize_before:
|
||||
x = self.norm1(x)
|
||||
|
|
|
@ -32,11 +32,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
|||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||
"""Initialize MultiLayeredConv1d module.
|
||||
|
||||
Args:
|
||||
in_chans (int): Number of input channels.
|
||||
hidden_chans (int): Number of hidden channels.
|
||||
kernel_size (int): Kernel size of conv1d.
|
||||
dropout_rate (float): Dropout rate.
|
||||
Parameters
|
||||
----------
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
|
||||
"""
|
||||
super(MultiLayeredConv1d, self).__init__()
|
||||
|
@ -58,14 +63,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
|||
def forward(self, x):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
"""
|
||||
# x = paddle.nn.ReLU(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
|
||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
||||
[0, 2, 1])
|
||||
|
@ -81,12 +88,16 @@ class Conv1dLinear(paddle.nn.Layer):
|
|||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||
"""Initialize Conv1dLinear module.
|
||||
|
||||
Args:
|
||||
in_chans (int): Number of input channels.
|
||||
hidden_chans (int): Number of hidden channels.
|
||||
kernel_size (int): Kernel size of conv1d.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
in_chans : int
|
||||
Number of input channels.
|
||||
hidden_chans : int
|
||||
Number of hidden channels.
|
||||
kernel_size : int
|
||||
Kernel size of conv1d.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
super(Conv1dLinear, self).__init__()
|
||||
self.w_1 = paddle.nn.Conv1D(
|
||||
|
@ -95,18 +106,22 @@ class Conv1dLinear(paddle.nn.Layer):
|
|||
kernel_size,
|
||||
stride=1,
|
||||
padding=(kernel_size - 1) // 2, )
|
||||
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans)
|
||||
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True)
|
||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||
self.relu = paddle.nn.ReLU()
|
||||
|
||||
def forward(self, x):
|
||||
"""Calculate forward propagation.
|
||||
|
||||
Args:
|
||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Batch of input tensors (B, T, in_chans).
|
||||
|
||||
Returns:
|
||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Batch of output tensors (B, T, in_chans).
|
||||
|
||||
"""
|
||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||
|
|
|
@ -19,11 +19,14 @@ import paddle
|
|||
class PositionwiseFeedForward(paddle.nn.Layer):
|
||||
"""Positionwise feed forward layer.
|
||||
|
||||
Args:
|
||||
idim (int): Input dimenstion.
|
||||
hidden_units (int): The number of hidden units.
|
||||
dropout_rate (float): Dropout rate.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
idim : int
|
||||
Input dimension.
|
||||
hidden_units : int
|
||||
The number of hidden units.
|
||||
dropout_rate : float
|
||||
Dropout rate.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -33,8 +36,8 @@ class PositionwiseFeedForward(paddle.nn.Layer):
|
|||
activation=paddle.nn.ReLU()):
|
||||
"""Construct an PositionwiseFeedForward object."""
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
self.w_1 = paddle.nn.Linear(idim, hidden_units)
|
||||
self.w_2 = paddle.nn.Linear(hidden_units, idim)
|
||||
self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True)
|
||||
self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True)
|
||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||
self.activation = activation
|
||||
|
||||
|
|
|
@ -29,12 +29,16 @@ class MultiSequential(paddle.nn.Sequential):
|
|||
def repeat(N, fn):
|
||||
"""Repeat module N times.
|
||||
|
||||
Args:
|
||||
N (int): Number of repeat time.
|
||||
fn (Callable): Function to generate module.
|
||||
|
||||
Returns:
|
||||
MultiSequential: Repeated model instance.
|
||||
Parameters
|
||||
----------
|
||||
N : int
|
||||
Number of times to repeat.
|
||||
fn : Callable
|
||||
Function to generate module.
|
||||
|
||||
Returns
|
||||
----------
|
||||
MultiSequential
|
||||
Repeated model instance.
|
||||
"""
|
||||
return MultiSequential(* [fn(n) for n in range(N)])
|
||||
|
|
|
@ -19,10 +19,12 @@ import paddle
|
|||
class LayerNorm(paddle.nn.LayerNorm):
|
||||
"""Layer normalization module.
|
||||
|
||||
Args:
|
||||
nout (int): Output dim size.
|
||||
dim (int): Dimension to be normalized.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nout : int
|
||||
Output dim size.
|
||||
dim : int
|
||||
Dimension to be normalized.
|
||||
"""
|
||||
|
||||
def __init__(self, nout, dim=-1):
|
||||
|
@ -33,12 +35,15 @@ class LayerNorm(paddle.nn.LayerNorm):
|
|||
def forward(self, x):
|
||||
"""Apply layer normalization.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): Input tensor.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Normalized tensor.
|
||||
Parameters
|
||||
----------
|
||||
x : paddle.Tensor
|
||||
Input tensor.
|
||||
|
||||
Returns
|
||||
----------
|
||||
paddle.Tensor
|
||||
Normalized tensor.
|
||||
"""
|
||||
if self.dim == -1:
|
||||
return super(LayerNorm, self).forward(x)
|
||||
|
|
|
@ -28,7 +28,7 @@ def is_broadcastable(shp1, shp2):
|
|||
def masked_fill(xs: paddle.Tensor,
|
||||
mask: paddle.Tensor,
|
||||
value: Union[float, int]):
|
||||
# assert is_broadcastable(xs.shape, mask.shape) is True
|
||||
assert is_broadcastable(xs.shape, mask.shape) is True
|
||||
bshape = paddle.broadcast_shape(xs.shape, mask.shape)
|
||||
mask = mask.broadcast_to(bshape)
|
||||
trues = paddle.ones_like(xs) * value
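A hedged usage sketch of this helper (values are made up): positions where the mask is True are overwritten with `value`, which is how the attention module above uses it:

import paddle
xs = paddle.to_tensor([[1.0, 2.0, 3.0]])
mask = paddle.to_tensor([[False, False, True]])
masked_fill(xs, mask, 0.0)     # -> [[1.0, 2.0, 0.0]]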
|
||||
|
|
|
@ -13,20 +13,27 @@
|
|||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from typeguard import check_argument_types
|
||||
|
||||
|
||||
# pad every item to the longest length in this batch
|
||||
def pad_list(xs, pad_value):
|
||||
"""Perform padding for the list of tensors.
|
||||
|
||||
Args:
|
||||
xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||
pad_value (float): Value for padding.
|
||||
Parameters
|
||||
----------
|
||||
xs : List[Tensor]
|
||||
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||
pad_value : float
|
||||
Value for padding.
|
||||
|
||||
Returns:
|
||||
Tensor: Padded tensor (B, Tmax, `*`).
|
||||
Returns
|
||||
----------
|
||||
Tensor
|
||||
Padded tensor (B, Tmax, `*`).
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
----------
|
||||
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
||||
>>> x
|
||||
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
||||
|
@ -34,11 +41,9 @@ def pad_list(xs, pad_value):
|
|||
tensor([[1., 1., 1., 1.],
|
||||
[1., 1., 0., 0.],
|
||||
[1., 0., 0., 0.]])
|
||||
|
||||
"""
|
||||
n_batch = len(xs)
|
||||
max_len = max(x.shape[0] for x in xs)
|
||||
# pad = xs[0].new(n_batch, max_len, *xs[0].shape[1:]).fill_(pad_value)
|
||||
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
|
||||
|
||||
for i in range(n_batch):
|
||||
|
@ -50,13 +55,18 @@ def pad_list(xs, pad_value):
|
|||
def make_pad_mask(lengths, length_dim=-1):
|
||||
"""Make mask tensor containing indices of padded part.
|
||||
|
||||
Args:
|
||||
lengths (LongTensor or List): Batch of lengths (B,).
|
||||
Parameters
|
||||
----------
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
|
||||
Returns:
|
||||
Tensor: Mask tensor containing indices of padded part bool.
|
||||
Returns
|
||||
----------
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of padded part.
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
----------
|
||||
With only lengths.
|
||||
|
||||
>>> lengths = [5, 3, 2]
|
||||
|
@ -64,7 +74,6 @@ def make_pad_mask(lengths, length_dim=-1):
|
|||
masks = [[0, 0, 0, 0 ,0],
|
||||
[0, 0, 0, 1, 1],
|
||||
[0, 0, 1, 1, 1]]
|
||||
|
||||
"""
|
||||
if length_dim == 0:
|
||||
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
||||
|
@ -88,17 +97,24 @@ def make_pad_mask(lengths, length_dim=-1):
|
|||
def make_non_pad_mask(lengths, length_dim=-1):
|
||||
"""Make mask tensor containing indices of non-padded part.
|
||||
|
||||
Args:
|
||||
lengths (LongTensor or List): Batch of lengths (B,).
|
||||
xs (Tensor, optional): The reference tensor.
|
||||
Parameters
|
||||
----------
|
||||
lengths : LongTensor or List
|
||||
Batch of lengths (B,).
|
||||
xs : Tensor, optional
|
||||
The reference tensor.
|
||||
If set, masks will be the same shape as this tensor.
|
||||
length_dim (int, optional): Dimension indicator of the above tensor.
|
||||
length_dim : int, optional
|
||||
Dimension indicator of the above tensor.
|
||||
See the example.
|
||||
|
||||
Returns:
|
||||
ByteTensor: mask tensor containing indices of padded part bool.
|
||||
Returns
|
||||
----------
|
||||
Tensor(bool)
|
||||
Mask tensor containing indices of non-padded part.
|
||||
|
||||
Examples:
|
||||
Examples
|
||||
----------
|
||||
With only lengths.
|
||||
|
||||
>>> lengths = [5, 3, 2]
|
||||
|
@ -106,6 +122,37 @@ def make_non_pad_mask(lengths, length_dim=-1):
|
|||
masks = [[1, 1, 1, 1 ,1],
|
||||
[1, 1, 1, 0, 0],
|
||||
[1, 1, 0, 0, 0]]
|
||||
|
||||
"""
|
||||
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
||||
|
||||
|
||||
def initialize(model: nn.Layer, init: str):
|
||||
"""Initialize weights of a neural network module.
|
||||
|
||||
Parameters are initialized using the given method or distribution.
|
||||
|
||||
Custom initialization routines can be implemented into submodules
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model : paddle.nn.Layer
|
||||
Target.
|
||||
init : str
|
||||
Method of initialization.
|
||||
"""
|
||||
assert check_argument_types()
|
||||
|
||||
if init == "xavier_uniform":
|
||||
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
|
||||
nn.initializer.Constant())
|
||||
elif init == "xavier_normal":
|
||||
nn.initializer.set_global_initializer(nn.initializer.XavierNormal(),
|
||||
nn.initializer.Constant())
|
||||
elif init == "kaiming_uniform":
|
||||
nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
|
||||
nn.initializer.Constant())
|
||||
elif init == "kaiming_normal":
|
||||
nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(),
|
||||
nn.initializer.Constant())
|
||||
else:
|
||||
raise ValueError("Unknown initialization: " + init)
|
||||
|
|