add models & modules back

2019-11-25 03:40:52 +00:00 · 2019-11-25 03:40:52 +00:00 · de4c5d4f49
parent a715e6029d
commit de4c5d4f49
24 changed files with 1701 additions and 0 deletions
--- a/parakeet/models/deepvoice3/README.md
+++ b/parakeet/models/deepvoice3/README.md
--- a/parakeet/models/deepvoice3/README_cn.md
+++ b/parakeet/models/deepvoice3/README_cn.md
--- a/parakeet/models/deepvoice3/_ce.py
+++ b/parakeet/models/deepvoice3/_ce.py
--- a/parakeet/models/deepvoice3/_images/model_architecture.png
+++ b/parakeet/models/deepvoice3/_images/model_architecture.png
--- a/parakeet/models/deepvoice3/audio.py
+++ b/parakeet/models/deepvoice3/audio.py
--- a/parakeet/models/deepvoice3/builder.py
+++ b/parakeet/models/deepvoice3/builder.py
--- a/parakeet/models/deepvoice3/compute_timestamp_ratio.py
+++ b/parakeet/models/deepvoice3/compute_timestamp_ratio.py
--- a/parakeet/models/deepvoice3/data.py
+++ b/parakeet/models/deepvoice3/data.py
--- a/parakeet/models/deepvoice3/deepvoice3.py
+++ b/parakeet/models/deepvoice3/deepvoice3.py
--- a/parakeet/models/deepvoice3/dry_run.py
+++ b/parakeet/models/deepvoice3/dry_run.py
--- a/parakeet/models/deepvoice3/eval_model.py
+++ b/parakeet/models/deepvoice3/eval_model.py
--- a/parakeet/models/deepvoice3/hparams.py
+++ b/parakeet/models/deepvoice3/hparams.py
--- a/parakeet/models/deepvoice3/ljspeech.py
+++ b/parakeet/models/deepvoice3/ljspeech.py
--- a/parakeet/models/deepvoice3/preprocess.py
+++ b/parakeet/models/deepvoice3/preprocess.py
--- a/parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json
+++ b/parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json
--- a/parakeet/models/deepvoice3/synthesis.py
+++ b/parakeet/models/deepvoice3/synthesis.py
--- a/parakeet/models/deepvoice3/train.py
+++ b/parakeet/models/deepvoice3/train.py
--- a/parakeet/models/deepvoice3/train.sh
+++ b/parakeet/models/deepvoice3/train.sh
--- a/parakeet/models/deepvoice3/train_model.py
+++ b/parakeet/models/deepvoice3/train_model.py
--- a/parakeet/modules/init.py
+++ b/parakeet/modules/init.py
--- a/parakeet/modules/conv.py
+++ b/parakeet/modules/conv.py
@ -0,0 +1,222 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy as np
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+from weight_norm import Conv2D, Conv2DTranspose
+
+
+class Conv1D(dg.Layer):
+    """
+    A convolution 1D block implemented with Conv2D. For simplicity, and to
+    ensure the output has the same length as the input, it does not allow
+    stride > 1.
+
+    Inputs are laid out as (B, C, 1, T); the convolution runs along the last
+    (time) axis with a (1, filter_size) kernel. Besides the batch ``forward``,
+    the layer supports step-by-step computation through
+    ``start_new_sequence`` and ``add_input``.
+
+    NOTE(review): ``temp_weight`` and ``input_buffer`` are only assigned in
+    ``start_new_sequence`` while ``add_input`` reads them, so it looks like
+    ``start_new_sequence`` has to be called before the first ``add_input``.
+    """
+
+    def __init__(self,
+                 name_scope,
+                 in_cahnnels,
+                 num_filters,
+                 filter_size=3,
+                 dilation=1,
+                 groups=None,
+                 causal=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 dtype="float32"):
+        """
+        Args:
+            name_scope (str): name scope passed to the base Layer.
+            in_cahnnels (int): number of input channels (the parameter name
+                is misspelled; it is kept as is for existing callers). It is
+                only stored on the instance, not passed to Conv2D.
+            num_filters (int): number of output channels.
+            filter_size (int): kernel width along the time axis.
+            dilation (int): dilation along the time axis.
+            groups: forwarded to Conv2D.
+            causal (bool): if True, pad by the full ``dilation *
+                (filter_size - 1)`` and trim the tail in ``forward``.
+            param_attr: forwarded to Conv2D.
+            bias_attr: forwarded to Conv2D.
+            use_cudnn (bool): forwarded to Conv2D.
+            act (str): activation name; forwarded to Conv2D and also applied
+                in ``add_input``.
+            dtype (str): data type of the layer.
+        """
+        super(Conv1D, self).__init__(name_scope, dtype=dtype)
+
+        # Causal convolutions pad by the whole receptive field minus one (the
+        # surplus steps are sliced off in forward()); otherwise pad by half
+        # of it, rounded down.
+        if causal:
+            padding = dilation * (filter_size - 1)
+        else:
+            padding = (dilation * (filter_size - 1)) // 2
+
+        self.in_channels = in_cahnnels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.dilation = dilation
+        self.causal = causal
+        self.padding = padding
+        self.act = act
+
+        # The 1D convolution is expressed as a 2D one whose kernel, stride,
+        # dilation and padding are all trivial along the height axis.
+        self.conv = Conv2D(
+            self.full_name(),
+            num_filters=num_filters,
+            filter_size=(1, filter_size),
+            stride=(1, 1),
+            dilation=(1, dilation),
+            padding=(0, padding),
+            groups=groups,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            use_cudnn=use_cudnn,
+            act=act,
+            dtype=dtype)
+
+    def forward(self, x):
+        """
+        Apply the convolution to a whole sequence.
+
+        Args:
+            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
+                input channels.
+
+        Returns:
+            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
+                output channels (num_filters).
+        """
+        x = self.conv(x)
+        if self.filter_size > 1:
+            if self.causal:
+                # drop the last `padding` time steps (axis 3 is time)
+                x = fluid.layers.slice(
+                    x, axes=[3], starts=[0], ends=[-self.padding])
+            elif self.filter_size % 2 == 0:
+                # even kernel: drop the last time step
+                # NOTE(review): whether the result then has exactly T steps
+                # depends on Conv2D's output-size arithmetic with the floored
+                # padding above; confirm for even filter sizes.
+                x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
+        return x
+
+    def start_new_sequence(self):
+        """
+        Reset the state used by ``add_input``: the cached linearized weight
+        and the buffer of previous inputs.
+        """
+        self.temp_weight = None
+        self.input_buffer = None
+
+    def add_input(self, x):
+        """
+        Add the input for one time step and compute the output for that time
+        step, using a buffer of the previous inputs.
+
+        Args:
+            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
+                input channels, and T = 1.
+
+        Returns:
+            out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
+            means output channels (num_filters), and T = 1.
+
+        """
+        # compute the matmul-form weight once per sequence
+        if self.temp_weight is None:
+            self.temp_weight = self._reshaped_weight()
+
+        # number of time steps spanned by the dilated kernel
+        window_size = 1 + (self.filter_size - 1) * self.dilation
+        batch_size = x.shape[0]
+        in_channels = x.shape[1]
+
+        if self.filter_size > 1:
+            if self.input_buffer is None:
+                # first step: history is window_size - 1 steps of zeros
+                self.input_buffer = fluid.layers.fill_constant(
+                    [batch_size, in_channels, 1, window_size - 1],
+                    dtype=x.dtype,
+                    value=0.0)
+            else:
+                # later steps: discard the oldest time step
+                self.input_buffer = self.input_buffer[:, :, :, 1:]
+            # append the new step; the buffer now holds window_size steps
+            self.input_buffer = fluid.layers.concat(
+                [self.input_buffer, x], axis=3)
+            x = self.input_buffer
+            if self.dilation > 1:
+                # keep only every `dilation`-th step of the window; the index
+                # Variable is created once and cached on the instance
+                if not hasattr(self, "indices"):
+                    self.indices = dg.to_variable(
+                        np.arange(0, window_size, self.dilation))
+                # swap the time axis to the front so gather can index it,
+                # then swap it back
+                tmp = fluid.layers.transpose(
+                    self.input_buffer, perm=[3, 1, 2, 0])
+                tmp = fluid.layers.gather(tmp, index=self.indices)
+                tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
+                x = tmp
+        # flatten the window so the convolution becomes a single matmul
+        inputs = fluid.layers.reshape(
+            x, shape=[batch_size, in_channels * 1 * self.filter_size])
+        out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
+        out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
+        # restore the trailing (1, T=1) axes
+        out = fluid.layers.reshape(out, out.shape + [1, 1])
+        out = self._helper.append_activation(out, act=self.act)
+        return out
+
+    def _reshaped_weight(self):
+        """
+        Get the linearized weight of the convolution filter, because it is by
+        nature a matmul weight. Since the model uses weight norm, the weight
+        is computed here as the l2-normalized ``_filter_param_v`` (normalized
+        along axis 1 of the reshaped matrix) scaled by ``_filter_param_g``.
+
+        Returns:
+            weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
+        """
+        shape = self.conv._filter_param_v.shape
+        # keep the first axis, flatten all remaining axes
+        matrix_shape = [shape[0], np.prod(shape[1:])]
+        weight_matrix = fluid.layers.reshape(
+            self.conv._filter_param_v, shape=matrix_shape)
+        weight_matrix = fluid.layers.elementwise_mul(
+            fluid.layers.l2_normalize(
+                weight_matrix, axis=1),
+            self.conv._filter_param_g,
+            axis=0)
+        return weight_matrix
+
+
+class Conv1DTranspose(dg.Layer):
+    """
+    A convolutional transpose 1D block implemented with convolutional transpose
+    2D. It does not ensure that the output is exactly expanded stride times in
+    the time dimension.
+    """
+
+    def __init__(self,
+                 name_scope,
+                 in_channels,
+                 num_filters,
+                 filter_size,
+                 padding=0,
+                 stride=1,
+                 dilation=1,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 dtype="float32"):
+        """
+        Args:
+            name_scope (str): name scope passed to the base Layer.
+            in_channels (int): number of input channels; only stored on the
+                instance, not passed to Conv2DTranspose.
+            num_filters (int): number of output channels.
+            filter_size (int): kernel width along the time axis.
+            padding (int): padding along the time axis.
+            stride (int): stride along the time axis.
+            dilation (int): dilation along the time axis.
+            groups: forwarded to Conv2DTranspose.
+            param_attr: forwarded to Conv2DTranspose.
+            bias_attr: forwarded to Conv2DTranspose.
+            use_cudnn (bool): forwarded to Conv2DTranspose.
+            act (str): activation name, forwarded to Conv2DTranspose.
+            dtype (str): data type of the layer.
+        """
+        super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype)
+
+        self.in_channels = in_channels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.padding = padding
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+
+        # The 1D transposed convolution is a 2D one that is trivial along
+        # the height axis.
+        self.conv_transpose = Conv2DTranspose(
+            self.full_name(),
+            num_filters,
+            filter_size=(1, filter_size),
+            padding=(0, padding),
+            stride=(1, stride),
+            dilation=(1, dilation),
+            groups=groups,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            use_cudnn=use_cudnn,
+            act=act,
+            dtype=dtype)
+
+    def forward(self, x):
+        """
+        Args:
+            x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input
+                channels and T_in means the number of time steps of input.
+
+        Returns:
+            out (Variable): shape(B, C_out, 1, T_out), where C_out means the
+                output channels and T_out means the number of time steps of
+                output.
+        """
+        return self.conv_transpose(x)
--- a/parakeet/modules/loss.py
+++ b/parakeet/modules/loss.py
@ -0,0 +1,158 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from numba import jit
+
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+
+def masked_mean(inputs, mask):
+    """
+    Compute the sum of ``inputs * mask`` divided by the sum of the mask after
+    it has been broadcast over channels. No gradient flows through the mask.
+
+    Args:
+        inputs (Variable): Shape(B, C, 1, T), the input, where B means
+            batch size, C means channels of input, T means timesteps of
+            the input.
+        mask (Variable): Shape(B, T), a mask.
+    Returns:
+        loss (Variable): Shape(1, ), masked mean.
+    """
+    channels = inputs.shape[1]
+    # (B, T) -> (B, 1, 1, T) -> (B, C, 1, T) so it lines up with inputs
+    reshaped_mask = fluid.layers.reshape(
+        mask, shape=[mask.shape[0], 1, 1, mask.shape[-1]])
+    expanded_mask = fluid.layers.expand(
+        reshaped_mask, expand_times=[1, channels, 1, 1])
+    expanded_mask.stop_gradient = True
+
+    # normalizer: sum of the expanded mask
+    # NOTE(review): there is no guard here for a mask that sums to zero.
+    valid_cnt = fluid.layers.reduce_sum(expanded_mask)
+    valid_cnt.stop_gradient = True
+
+    masked_inputs = inputs * expanded_mask
+    loss = fluid.layers.reduce_sum(masked_inputs) / valid_cnt
+    return loss
+
+
+@jit(nopython=True)
+def guided_attention(N, max_N, T, max_T, g):
+    """
+    Build one guided-attention weight matrix.
+
+    For n < N and t < T the entry is
+    ``1 - exp(-(n / N - t / T)**2 / (2 * g * g))``; all other entries of the
+    (max_N, max_T) matrix stay zero.
+
+    Args:
+        N (int): number of valid rows.
+        max_N (int): number of rows of the returned matrix.
+        T (int): number of valid columns.
+        max_T (int): number of columns of the returned matrix.
+        g (float): width parameter of the Gaussian-shaped term.
+
+    Returns:
+        W (numpy.ndarray): Shape(max_N, max_T), dtype float32.
+    """
+    W = np.zeros((max_N, max_T), dtype=np.float32)
+    for n in range(N):
+        for t in range(T):
+            W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
+    return W
+
+
+def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2):
+    """
+    Build a batch of guided-attention weight matrices, one per example.
+
+    Args:
+        input_lengths (numpy.ndarray): Shape(B,), valid input lengths; its
+            maximum is used as the padded input length (``.max()`` is called
+            on it, so it must provide that method).
+        target_lengths: Shape(B,), valid target lengths, indexed per example.
+        max_target_len (int): padded target length.
+        g (float): width parameter passed to ``guided_attention``.
+
+    Returns:
+        W (numpy.ndarray): Shape(B, max_target_len, max_input_len), dtype
+            float32.
+    """
+    B = len(input_lengths)
+    max_input_len = input_lengths.max()
+    W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32)
+    for b in range(B):
+        # guided_attention returns (max_input_len, max_target_len); transpose
+        # it so the target axis comes first
+        W[b] = guided_attention(input_lengths[b], max_input_len,
+                                target_lengths[b], max_target_len, g).T
+    return W
+
+
+class TTSLoss(object):
+    """
+    A collection of loss terms: l1 loss, binary divergence, done loss and
+    guided attention loss.
+
+    The mask-aware losses blend a masked mean and a plain mean with
+    ``masked_weight``; the l1 loss can additionally blend in a loss over the
+    leading ``priority_bin`` channels with ``priority_weight``.
+    """
+
+    def __init__(self,
+                 masked_weight=0.0,
+                 priority_weight=0.0,
+                 binary_divergence_weight=0.0,
+                 guided_attention_sigma=0.2):
+        """
+        Args:
+            masked_weight (float): weight of the masked mean when blending it
+                with the plain mean; the masked path is used only if > 0.
+            priority_weight (float): weight of the priority-channel l1 loss;
+                used only if > 0.
+            binary_divergence_weight (float): stored on the instance; not
+                read by any method of this class.
+            guided_attention_sigma (float): width parameter ``g`` passed to
+                ``guided_attentions``.
+        """
+        self.masked_weight = masked_weight
+        self.priority_weight = priority_weight
+        self.binary_divergence_weight = binary_divergence_weight
+        self.guided_attention_sigma = guided_attention_sigma
+
+    def l1_loss(self, prediction, target, mask, priority_bin=None):
+        """
+        L1 loss between prediction and target, optionally mask-weighted and
+        optionally emphasizing the first ``priority_bin`` channels.
+
+        Args:
+            prediction (Variable): the prediction. When a mask is used it is
+                passed to ``masked_mean``, which expects Shape(B, C, 1, T).
+            target (Variable): the target, same shape as prediction.
+            mask (Variable): Shape(B, T) mask, or None to skip masking.
+            priority_bin (int, optional): number of leading channels (axis 1)
+                that form the priority slice.
+
+        Returns:
+            loss (Variable): the l1 loss.
+        """
+        abs_diff = fluid.layers.abs(prediction - target)
+
+        # basic mask-weighted l1 loss
+        w = self.masked_weight
+        if w > 0 and mask is not None:
+            base_l1_loss = w * masked_mean(abs_diff, mask) + (
+                1 - w) * fluid.layers.reduce_mean(abs_diff)
+        else:
+            base_l1_loss = fluid.layers.reduce_mean(abs_diff)
+
+        if self.priority_weight > 0 and priority_bin is not None:
+            # mask-weighted priority channels' l1-loss
+            priority_abs_diff = fluid.layers.slice(
+                abs_diff, axes=[1], starts=[0], ends=[priority_bin])
+            if w > 0 and mask is not None:
+                priority_loss = w * masked_mean(priority_abs_diff, mask) + (
+                    1 - w) * fluid.layers.reduce_mean(priority_abs_diff)
+            else:
+                priority_loss = fluid.layers.reduce_mean(priority_abs_diff)
+
+            # priority weighted sum
+            p = self.priority_weight
+            loss = p * priority_loss + (1 - p) * base_l1_loss
+        else:
+            loss = base_l1_loss
+        return loss
+
+    def binary_divergence(self, prediction, target, mask):
+        """
+        Element-wise log loss between prediction and target, reduced with the
+        same masked/plain mean blend as ``l1_loss``.
+
+        Args:
+            prediction (Variable): the prediction. When a mask is used it is
+                passed to ``masked_mean``, which expects Shape(B, C, 1, T).
+            target (Variable): the target, same shape as prediction.
+            mask (Variable): Shape(B, T) mask, or None to skip masking.
+
+        Returns:
+            loss (Variable): the binary divergence loss.
+        """
+        # log_loss is applied on column vectors, then the result is restored
+        # to the prediction's shape
+        flattened_prediction = fluid.layers.reshape(prediction, [-1, 1])
+        flattened_target = fluid.layers.reshape(target, [-1, 1])
+        flattened_loss = fluid.layers.log_loss(
+            flattened_prediction, flattened_target, epsilon=1e-8)
+        bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
+
+        w = self.masked_weight
+        if w > 0 and mask is not None:
+            loss = w * masked_mean(bin_div, mask) + (
+                1 - w) * fluid.layers.reduce_mean(bin_div)
+        else:
+            loss = fluid.layers.reduce_mean(bin_div)
+        return loss
+
+    @staticmethod
+    def done_loss(done_hat, done):
+        """
+        Mean log loss between the predicted and the target done flags.
+
+        Args:
+            done_hat (Variable): predicted done values; flattened internally.
+            done (Variable): target done values; flattened internally.
+
+        Returns:
+            loss (Variable): the mean log loss.
+        """
+        flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1])
+        flat_done = fluid.layers.reshape(done, [-1, 1])
+        loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
+        loss = fluid.layers.reduce_mean(loss)
+        return loss
+
+    def attention_loss(self, predicted_attention, input_lengths,
+                       target_lengths):
+        """
+        Given valid encoder_lengths and decoder_lengths, compute a diagonal
+        guide, and compute loss from the predicted attention and the guide.
+
+        Args:
+            predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the
+                alignment tensor, where B means batch size, T_dec means number
+                of time steps of the decoder, T_enc means the number of time
+                steps of the encoder, * means other possible dimensions.
+                The shape is unpacked into exactly four values below, so the
+                tensor has to be 4-dimensional.
+            input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
+                (time steps) of encoder outputs.
+            target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
+                valid lengths (time steps) of decoder outputs.
+
+        Returns:
+            loss (Variable): Shape(1, ) attention loss.
+        """
+        # only max_target_len is used afterwards
+        n_attention, batch_size, max_target_len, max_input_len = (
+            predicted_attention.shape)
+        # NOTE(review): the guide's last axis is input_lengths.max(), not
+        # max_input_len from the tensor; presumably the two are equal, confirm.
+        soft_mask = guided_attentions(input_lengths, target_lengths,
+                                      max_target_len,
+                                      self.guided_attention_sigma)
+        soft_mask_ = dg.to_variable(soft_mask)
+        loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
+        return loss
--- a/parakeet/modules/modules.py
+++ b/parakeet/modules/modules.py
@ -0,0 +1,458 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+import numpy as np
+
+import conv
+import weight_norm as weight_norm
+
+
+def FC(name_scope,
+       in_features,
+       size,
+       num_flatten_dims=1,
+       dropout=0.0,
+       epsilon=1e-30,
+       act=None,
+       is_test=False,
+       dtype="float32"):
+    """
+    A special Linear Layer, when it is used with dropout, the weight is
+    initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
+
+    Args:
+        name_scope (str): name scope of the layer.
+        in_features (int | list of int): input feature size(s); one weight
+            initializer is created per entry.
+        size (int): output size, forwarded to weight_norm.FC.
+        num_flatten_dims (int): forwarded to weight_norm.FC.
+        dropout (float): dropout rate, used only to scale the initial std.
+        epsilon (float): accepted but not used in this function.
+        act (str): activation name, forwarded to weight_norm.FC.
+        is_test (bool): accepted but not used in this function.
+        dtype (str): data type, forwarded to weight_norm.FC.
+
+    Returns:
+        layer: the weight_norm.FC layer that was built.
+    """
+
+    # stds
+    if isinstance(in_features, int):
+        in_features = [in_features]
+    stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
+    weight_inits = [
+        fluid.initializer.NormalInitializer(scale=std) for std in stds
+    ]
+    bias_init = fluid.initializer.ConstantInitializer(0.0)
+
+    # param attrs
+    weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
+    bias_attr = fluid.ParamAttr(initializer=bias_init)
+
+    layer = weight_norm.FC(name_scope,
+                           size,
+                           num_flatten_dims=num_flatten_dims,
+                           param_attr=weight_attrs,
+                           bias_attr=bias_attr,
+                           act=act,
+                           dtype=dtype)
+    return layer
+
+
+def Conv1D(name_scope,
+           in_channels,
+           num_filters,
+           filter_size=3,
+           dilation=1,
+           groups=None,
+           causal=False,
+           std_mul=1.0,
+           dropout=0.0,
+           use_cudnn=True,
+           act=None,
+           dtype="float32"):
+    """
+    A special Conv1D Layer, when it is used with dropout, the weight is
+    initialized as
+    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_channels)))
+    and the bias is initialized to zero.
+
+    Args:
+        name_scope (str): name scope of the layer.
+        in_channels (int): number of input channels.
+        num_filters (int): number of output channels.
+        filter_size (int): kernel width.
+        dilation (int): dilation of the convolution.
+        groups: forwarded to conv.Conv1D.
+        causal (bool): forwarded to conv.Conv1D.
+        std_mul (float): multiplier inside the initial std formula.
+        dropout (float): dropout rate, used only to scale the initial std.
+        use_cudnn (bool): forwarded to conv.Conv1D.
+        act (str): activation name, forwarded to conv.Conv1D.
+        dtype (str): data type, forwarded to conv.Conv1D.
+
+    Returns:
+        layer: the conv.Conv1D layer that was built.
+    """
+    # std
+    std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
+    weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
+    bias_init = fluid.initializer.ConstantInitializer(0.0)
+
+    # param attrs
+    weight_attr = fluid.ParamAttr(initializer=weight_init)
+    bias_attr = fluid.ParamAttr(initializer=bias_init)
+
+    layer = conv.Conv1D(
+        name_scope,
+        in_channels,
+        num_filters,
+        filter_size,
+        dilation,
+        groups=groups,
+        causal=causal,
+        param_attr=weight_attr,
+        bias_attr=bias_attr,
+        use_cudnn=use_cudnn,
+        act=act,
+        dtype=dtype)
+    return layer
+
+
+def Embedding(name_scope,
+              num_embeddings,
+              embed_dim,
+              is_sparse=False,
+              is_distributed=False,
+              padding_idx=None,
+              std=0.01,
+              dtype="float32"):
+    # param attrs
+    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
+        scale=std))
+    layer = dg.Embedding(
+        name_scope, (num_embeddings, embed_dim),
+        padding_idx=padding_idx,
+        param_attr=weight_attr,
+        dtype=dtype)
+    return layer
+
+
+class Conv1DGLU(dg.Layer):
+    """
+    A Convolution 1D block with GLU activation. It also applies dropout to the
+    input x. It fuses speaker embeddings through a projection activated by
+    softsign (implemented as a Conv1D with filter_size 1). It has a residual
+    connection from the input x, and scales the output by np.sqrt(0.5).
+
+    NOTE(review): ``self.fc`` is only created when ``n_speakers > 1``, while
+    ``forward`` / ``add_input`` use it whenever a speaker embedding is passed.
+    """
+
+    def __init__(self,
+                 name_scope,
+                 n_speakers,
+                 speaker_dim,
+                 in_channels,
+                 num_filters,
+                 filter_size,
+                 dilation,
+                 std_mul=4.0,
+                 dropout=0.0,
+                 causal=False,
+                 residual=True,
+                 dtype="float32"):
+        """
+        Args:
+            name_scope (str): name scope passed to the base Layer.
+            n_speakers (int): number of speakers; the speaker projection is
+                built only when it is greater than 1.
+            speaker_dim (int): speaker embedding size; must not be None when
+                n_speakers > 1.
+            in_channels (int): number of input channels.
+            num_filters (int): number of output channels; must equal
+                in_channels when residual is True.
+            filter_size (int): kernel width of the convolution.
+            dilation (int): dilation of the convolution.
+            std_mul (float): multiplier for the convolution's initial std.
+            dropout (float): dropout rate applied to the input, also used for
+                the convolution's initial std.
+            causal (bool): whether the convolution is causal.
+            residual (bool): whether to add the residual connection.
+            dtype (str): data type of the layer.
+        """
+        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
+
+        # conv spec
+        self.in_channels = in_channels
+        self.n_speakers = n_speakers
+        self.speaker_dim = speaker_dim
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.dilation = dilation
+        self.causal = causal
+        self.residual = residual
+
+        # weight init and dropout
+        self.std_mul = std_mul
+        self.dropout = dropout
+
+        if residual:
+            assert (
+                in_channels == num_filters
+            ), "this block uses residual connection"\
+                "the input_channes should equals num_filters"
+
+        # the convolution produces 2 * num_filters channels: one half is the
+        # content, the other half the gate of the GLU
+        self.conv = Conv1D(
+            self.full_name(),
+            in_channels,
+            2 * num_filters,
+            filter_size,
+            dilation,
+            causal=causal,
+            std_mul=std_mul,
+            dropout=dropout,
+            dtype=dtype)
+
+        if n_speakers > 1:
+            assert (speaker_dim is not None
+                    ), "speaker embed should not be null in multi-speaker case"
+            # point-wise projection of the speaker embedding to num_filters
+            self.fc = Conv1D(
+                self.full_name(),
+                speaker_dim,
+                num_filters,
+                filter_size=1,
+                dilation=1,
+                causal=False,
+                act="softsign",
+                dtype=dtype)
+
+    def forward(self, x, speaker_embed_bc1t=None):
+        """
+        Args:
+            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
+                layer, where B means batch_size, C_in means the input channels
+                T means input time steps.
+            speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded
+                speaker embed, where C_sp means speaker embedding size. Note
+                that when using residual connection, the Conv1DGLU does not
+                change the number of channels, so out channels equals input
+                channels.
+
+        Returns:
+            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
+                C_out means the output channels of Conv1DGLU.
+        """
+
+        residual = x
+        x = fluid.layers.dropout(
+            x, self.dropout, dropout_implementation="upscale_in_train")
+        x = self.conv(x)
+
+        # split the channel axis into the content half and the gate half
+        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
+
+        if speaker_embed_bc1t is not None:
+            sp = self.fc(speaker_embed_bc1t)
+            content = content + sp
+
+        # glu
+        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
+
+        if self.residual:
+            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
+        return x
+
+    def add_input(self, x, speaker_embed_bc11=None):
+        """
+        Step-wise counterpart of ``forward``: feed one time step and get the
+        output for that time step.
+
+        Args:
+            x (Variable): Shape(B, num_filters, 1, time_steps), the input for
+                the current step, where time_steps = 1.
+            speaker_embed_bc11 (Variable): Shape(B, speaker_dim, 1,
+                time_steps), the speaker embedding for the current step.
+
+        Returns:
+            x (Variable): Shape(B, num_filters, 1, time_steps), where
+                time_steps = 1.
+        """
+
+        residual = x
+
+        # add step input and produce step output
+        x = fluid.layers.dropout(
+            x, self.dropout, dropout_implementation="upscale_in_train")
+        x = self.conv.add_input(x)
+
+        # split the channel axis into the content half and the gate half
+        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
+
+        if speaker_embed_bc11 is not None:
+            sp = self.fc(speaker_embed_bc11)
+            content = content + sp
+
+        # glu
+        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
+
+        if self.residual:
+            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
+        return x
+
+
+def Conv1DTranspose(name_scope,
+                    in_channels,
+                    num_filters,
+                    filter_size,
+                    padding=0,
+                    stride=1,
+                    dilation=1,
+                    groups=None,
+                    std_mul=1.0,
+                    dropout=0.0,
+                    use_cudnn=True,
+                    act=None,
+                    dtype="float32"):
+    """
+    Build a conv.Conv1DTranspose whose weight is initialized as
+    normal(0, std=np.sqrt(std_mul * (1-dropout) / (in_channels * filter_size)))
+    and whose bias is initialized to zero.
+
+    Args:
+        name_scope (str): name scope of the layer.
+        in_channels (int): number of input channels.
+        num_filters (int): number of output channels.
+        filter_size (int): kernel width.
+        padding (int): forwarded to conv.Conv1DTranspose.
+        stride (int): forwarded to conv.Conv1DTranspose.
+        dilation (int): forwarded to conv.Conv1DTranspose.
+        groups: forwarded to conv.Conv1DTranspose.
+        std_mul (float): multiplier inside the initial std formula.
+        dropout (float): dropout rate, used only to scale the initial std.
+        use_cudnn (bool): forwarded to conv.Conv1DTranspose.
+        act (str): activation name, forwarded to conv.Conv1DTranspose.
+        dtype (str): data type, forwarded to conv.Conv1DTranspose.
+
+    Returns:
+        layer: the conv.Conv1DTranspose layer that was built.
+    """
+    std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
+    weight_init = fluid.initializer.NormalInitializer(scale=std)
+    weight_attr = fluid.ParamAttr(initializer=weight_init)
+    bias_init = fluid.initializer.ConstantInitializer(0.0)
+    bias_attr = fluid.ParamAttr(initializer=bias_init)
+    layer = conv.Conv1DTranspose(
+        name_scope,
+        in_channels,
+        num_filters,
+        filter_size,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        groups=groups,
+        param_attr=weight_attr,
+        bias_attr=bias_attr,
+        use_cudnn=use_cudnn,
+        act=act,
+        dtype=dtype)
+    return layer
+
+
+def compute_position_embedding(rad):
+    """
+    Turn a table of sinusoid arguments into a position embedding table by
+    applying sin to the even-indexed rows and cos to the odd-indexed rows,
+    then transposing the result.
+
+    Args:
+        rad (Variable): Shape(embed_dim, n_vocab), the transposed table of
+            sinusoid arguments (called "radius" by the original author;
+            presumably angles in radians).
+
+    Returns:
+        out (Variable): Shape(n_vocab, embed_dim), the embedding table.
+    """
+    # rad is a transposed radius, shape(embed_dim, n_vocab)
+    # (n_vocab is unpacked but not used below)
+    embed_dim, n_vocab = rad.shape
+
+    # row indices of the even and the odd embedding dimensions
+    even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
+    odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
+
+    even_rads = fluid.layers.gather(rad, even_dims)
+    odd_rads = fluid.layers.gather(rad, odd_dims)
+
+    sines = fluid.layers.sin(even_rads)
+    cosines = fluid.layers.cos(odd_rads)
+
+    # write the sines and cosines over the rows they were computed from
+    temp = fluid.layers.scatter(rad, even_dims, sines)
+    out = fluid.layers.scatter(temp, odd_dims, cosines)
+    out = fluid.layers.transpose(out, perm=[1, 0])
+    return out
+
+
+def position_encoding_init(n_position,
+                           d_pos_vec,
+                           position_rate=1.0,
+                           sinusoidal=True):
+    """
+    Init the sinusoid position encoding table.
+
+    Entry (pos, i) is ``position_rate * pos / 10000 ** (2 * (i // 2) /
+    d_pos_vec)`` and row 0 is all zeros. If ``sinusoidal`` is True, sin is
+    applied to the even columns and cos to the odd columns of rows 1 and up;
+    otherwise the raw arguments are returned.
+
+    Args:
+        n_position (int): number of positions (rows).
+        d_pos_vec (int): size of each position vector (columns).
+        position_rate (float): multiplier applied to the position.
+        sinusoidal (bool): whether to apply sin / cos to the table.
+
+    Returns:
+        position_enc (numpy.ndarray): the position encoding table, one row per
+            position.
+    """
+
+    # keep idx 0 for padding token position encoding zero vector
+    # NOTE(review): `2 * (i // 2) / d_pos_vec` relies on true division; this
+    # module has no `from __future__ import division`, so confirm it is never
+    # run under Python 2.
+    position_enc = np.array([[
+        position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
+        for i in range(d_pos_vec)
+    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
+
+    if sinusoidal:
+        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
+        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
+
+    return position_enc
+
+
+class PositionEmbedding(dg.Layer):
+    """
+    Position embedding whose table stores the raw sinusoid arguments.
+
+    The embedding weight is initialized with ``position_encoding_init(...,
+    sinusoidal=False)``; sin / cos are applied in ``forward`` through
+    ``compute_position_embedding``, which lets a speaker-specific position
+    rate rescale the arguments before the lookup.
+    """
+
+    def __init__(self,
+                 name_scope,
+                 n_position,
+                 d_pos_vec,
+                 position_rate=1.0,
+                 is_sparse=False,
+                 is_distributed=False,
+                 param_attr=None,
+                 max_norm=None,
+                 padding_idx=None,
+                 dtype="float32"):
+        """
+        Args:
+            name_scope (str): name scope passed to the base Layer.
+            n_position (int): number of positions (rows of the table).
+            d_pos_vec (int): position embedding size.
+            position_rate (float): position rate baked into the initial table.
+            is_sparse (bool): forwarded to dg.Embedding and to the
+                lookup_table op.
+            is_distributed (bool): forwarded to dg.Embedding and to the
+                lookup_table op.
+            param_attr: forwarded to dg.Embedding.
+            max_norm: stored on the instance; not read by any method of this
+                class.
+            padding_idx (int, optional): padding index used by the
+                lookup_table op; negative values count from n_position.
+            dtype (str): data type of the layer.
+        """
+        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
+        # padding_idx is deliberately not given to the inner embedding here;
+        # it is applied as an attribute of the lookup_table op in forward().
+        self.embed = dg.Embedding(
+            self.full_name(),
+            size=(n_position, d_pos_vec),
+            is_sparse=is_sparse,
+            is_distributed=is_distributed,
+            padding_idx=None,
+            param_attr=param_attr,
+            dtype=dtype)
+        # store the raw arguments (no sin / cos yet)
+        self.set_weight(
+            position_encoding_init(
+                n_position,
+                d_pos_vec,
+                position_rate=position_rate,
+                sinusoidal=False).astype(dtype))
+
+        self._is_sparse = is_sparse
+        self._is_distributed = is_distributed
+        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
+        if self._remote_prefetch:
+            assert self._is_sparse is True and self._is_distributed is False
+
+        # -1 means "no padding index"; a negative index counts from the end
+        self._padding_idx = (-1 if padding_idx is None else padding_idx if
+                             padding_idx >= 0 else (n_position + padding_idx))
+        self._position_rate = position_rate
+        self._max_norm = max_norm
+        self._dtype = dtype
+
+    def set_weight(self, array):
+        """
+        Overwrite the embedding weight with ``array``.
+
+        Args:
+            array (numpy.ndarray): new weight; its shape must equal the shape
+                of the embedding weight.
+        """
+        assert self.embed._w.shape == list(array.shape), "shape does not match"
+        self.embed._w._ivar.value().get_tensor().set(
+            array, fluid.framework._current_expected_place())
+
+    def forward(self, indices, speaker_position_rate=None):
+        """
+        Args:
+            indices (Variable): Shape (B, T, 1), dtype: int64, position
+                indices, where B means the batch size, T means the time steps.
+            speaker_position_rate (Variable | float, optional), position
+                rate. It can be a float point number or a Variable with
+                shape (1,), then this speaker_position_rate is used for every
+                example. It can also be a Variable with shape (B, 1), which
+                contains a speaker position rate for each speaker.
+                NOTE(review): the single-rate Variable branch below checks
+                for shape [1, 1], not (1,); confirm which one is intended.
+        Returns:
+            out (Variable): Shape(B, C_pos), position embedding, where C_pos
+                means position embedding size.
+                NOTE(review): with indices of shape (B, T, 1) the lookup
+                presumably yields (B, T, C_pos); confirm.
+        """
+        # (n_position, d_pos_vec) -> (d_pos_vec, n_position)
+        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
+        batch_size = indices.shape[0]
+
+        if speaker_position_rate is None:
+            # no extra rate: use the table as initialized
+            weight = compute_position_embedding(rad)
+            out = self._helper.create_variable_for_type_inference(self._dtype)
+            self._helper.append_op(
+                type="lookup_table",
+                inputs={"Ids": indices,
+                        "W": weight},
+                outputs={"Out": out},
+                attrs={
+                    "is_sparse": self._is_sparse,
+                    "is_distributed": self._is_distributed,
+                    "remote_prefetch": self._remote_prefetch,
+                    "padding_idx":
+                    self._padding_idx,  # special value for lookup table op
+                })
+            return out
+
+        elif (np.isscalar(speaker_position_rate) or
+              isinstance(speaker_position_rate, fluid.framework.Variable) and
+              speaker_position_rate.shape == [1, 1]):
+            # one rate shared by the whole batch
+            # scale the weight (the operand for sin & cos)
+            if np.isscalar(speaker_position_rate):
+                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
+            else:
+                scaled_rad = fluid.layers.elementwise_mul(
+                    rad, speaker_position_rate[0])
+            weight = compute_position_embedding(scaled_rad)
+            out = self._helper.create_variable_for_type_inference(self._dtype)
+            self._helper.append_op(
+                type="lookup_table",
+                inputs={"Ids": indices,
+                        "W": weight},
+                outputs={"Out": out},
+                attrs={
+                    "is_sparse": self._is_sparse,
+                    "is_distributed": self._is_distributed,
+                    "remote_prefetch": self._remote_prefetch,
+                    "padding_idx":
+                    self._padding_idx,  # special value for lookup table op
+                })
+            return out
+
+        elif np.prod(speaker_position_rate.shape) > 1:
+            # one rate per example: build a table and look it up per example,
+            # then stack the results
+            assert speaker_position_rate.shape == [batch_size, 1]
+            outputs = []
+            for i in range(batch_size):
+                rate = speaker_position_rate[i]  # rate has shape [1]
+                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
+                weight = compute_position_embedding(scaled_rad)
+                out = self._helper.create_variable_for_type_inference(
+                    self._dtype)
+                sequence = indices[i]
+                # NOTE(review): this branch passes padding_idx -1 instead of
+                # self._padding_idx used by the other branches; confirm that
+                # is intended.
+                self._helper.append_op(
+                    type="lookup_table",
+                    inputs={"Ids": sequence,
+                            "W": weight},
+                    outputs={"Out": out},
+                    attrs={
+                        "is_sparse": self._is_sparse,
+                        "is_distributed": self._is_distributed,
+                        "remote_prefetch": self._remote_prefetch,
+                        "padding_idx": -1,
+                    })
+                outputs.append(out)
+            out = fluid.layers.stack(outputs)
+            return out
+        else:
+            raise Exception("Then you can just use position rate at init")
--- a/parakeet/modules/weight_norm.py
+++ b/parakeet/modules/weight_norm.py
@ -0,0 +1,863 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from copy import deepcopy
+
+import numpy as np
+from six.moves import reduce
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+from paddle.fluid import core
+from paddle.fluid.layers import utils
+from paddle.fluid.framework import Parameter, Variable
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+
+
+def _norm(p, dim):
+    """Computes the norm over all dimensions except dim.
+    It differs from pytorch implementation that it does not keep dim.
+    This difference is related with the broadcast mechanism in paddle.
+    Read elementeise_mul for more.
+    """
+
+    if dim is None:
+        return np.linalg.norm(p, ord=2, axis=None)
+    elif dim == 0:
+        p = np.reshape(p, newshape=(p.shape[0], -1))
+        return np.linalg.norm(p, ord=2, axis=1)
+    elif dim == p.ndim - 1:
+        p = np.reshape(p, newshape=(-1, p.shape[-1]))
+        return np.linalg.norm(p, ord=2, axis=0)
+    else:
+        perm = list(range(p.ndim))
+        perm[0] = dim
+        perm[dim] = 0
+        return _norm(np.transpose(p, axes=perm))
+
+
+class FC(dg.Layer):
+    """
+    **Fully Connected Layer**
+
+    This function creates a fully connected layer in the network. It can take
+    one or multiple tensors as its inputs(input can be a list of Variable, see
+    Args in detail). It creates a pair of variables called (magnitude(g), 
+    direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected 
+    weight matrix from each input unit to each output unit. 
+    The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [M, `size`],
+    where M is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
+    is not None, a bias variable will be created and added to the output.
+    Finally, if activation is not None, it will be applied to the output as well.
+
+    When the input is single tensor:
+
+    .. math::
+
+        Out = Act({X(normalize(V)g) + b})
+
+    When the input are multiple tensors:
+
+    .. math::
+
+        Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b})
+
+    In the above equation:
+
+    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
+    * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.
+
+    See below for an example.
+
+    .. code-block:: text
+
+        Given:
+            data_1.data = [[[0.1, 0.2],
+                           [0.3, 0.4]]]
+            data_1.shape = (1, 2, 2) # 1 is batch_size
+
+            data_2 = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3)
+
+            out = fluid.layers.fc(input=[data_1, data_2], size=2)
+
+        Then:
+            out.data = [[0.18669507, 0.1893476]]
+            out.shape = (1, 2)
+
+    Args:
+        name_scope(str): The name of this class.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
+        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        act (str|None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase. Default: False
+        dtype(str): Dtype used for weight
+
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+
+    Examples:
+        .. code-block:: python
+
+          from paddle.fluid.dygraph.base import to_variable
+          import paddle.fluid as fluid
+          from paddle.fluid.dygraph import FC
+          import numpy as np
+
+          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
+          with fluid.dygraph.guard():
+              fc = FC( "fc", 64, num_flatten_dims=2)
+              data = to_variable( data )
+              conv = fc( data )
+
+    """
+
+    def __init__(self,
+                 name_scope,
+                 size,
+                 num_flatten_dims=1,
+                 epsilon=1e-30,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None,
+                 is_test=False,
+                 dtype="float32"):
+        super(FC, self).__init__(name_scope, dtype)
+
+        self._size = size
+        self._num_flatten_dims = num_flatten_dims
+        self._epsilon = epsilon
+        self._dtype = dtype
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._act = act
+        self.__g = list()
+        self.__v = list()
+
+    @property
+    def _v(self, i=0):
+        return self.__v[i]
+
+    @property
+    def _g(self, i=0):
+        return self.__g[i]
+
+    @_v.setter
+    def _v(self, value, i=0):
+        assert isinstance(value, Parameter)
+        self.__v[i] = value
+
+    @_g.setter
+    def _g(self, value, i=0):
+        assert isinstance(value, Parameter)
+        self.__g[i] = value
+
+    def _build_once(self, input):
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            input_shape = inp.shape
+
+            param_shape = [
+                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
+                       1)
+            ] + [self._size]
+            self.__v.append(
+                self.add_parameter(
+                    "_v%d" % i,
+                    self.create_parameter(
+                        attr=param,
+                        shape=param_shape,
+                        dtype=self._dtype,
+                        is_bias=False)))
+
+            magnitude_shape = param_shape[1:]
+            magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0)
+
+            self.__g.append(
+                self.add_parameter(
+                    "_g%d" % i,
+                    self.create_parameter(
+                        attr=fluid.ParamAttr(
+                            initializer=fluid.initializer.NumpyArrayInitializer(
+                                magnitude_value)),
+                        shape=magnitude_shape,
+                        dtype=self._dtype,
+                        is_bias=False)))
+            i += 1
+
+        size = list([self._size])
+        self._b = self.create_parameter(
+            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
+
+    def forward(self, input):
+        mul_results = list()
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            v_norm = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            v_normalized = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            self._helper.append_op(
+                type="norm",
+                inputs={"X": self.__v[i]},
+                outputs={"Out": v_normalized,
+                         "Norm": v_norm},
+                attrs={"axis": 0,
+                       "epsilon": self._epsilon})
+            weight = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            self._helper.append_op(
+                type="elementwise_mul",
+                inputs={"X": [v_normalized],
+                        "Y": [self.__g[i]]},
+                outputs={"Out": [weight]},
+                attrs={"axis": 1})
+            tmp = self._helper.create_variable_for_type_inference(self._dtype)
+            self._helper.append_op(
+                type="mul",
+                inputs={"X": inp,
+                        "Y": weight},
+                outputs={"Out": tmp},
+                attrs={
+                    "x_num_col_dims": self._num_flatten_dims,
+                    "y_num_col_dims": 1
+                })
+            i += 1
+            mul_results.append(tmp)
+
+        if len(mul_results) == 1:
+            pre_bias = mul_results[0]
+        else:
+            pre_bias = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            self._helper.append_op(
+                type="sum",
+                inputs={"X": mul_results},
+                outputs={"Out": pre_bias},
+                attrs={"use_mkldnn": False})
+
+        if self._b:
+            pre_activation = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type="elementwise_add",
+                inputs={"X": [pre_bias],
+                        "Y": [self._b]},
+                outputs={"Out": [pre_activation]},
+                attrs={"axis": self._num_flatten_dims})
+        else:
+            pre_activation = pre_bias
+        # Currently, we don't support inplace in dygraph mode
+        return self._helper.append_activation(pre_activation, act=self._act)
+
+
+class Conv2D(dg.Layer):
+    """
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
+    channels, H is the height of the feature, and W is the width of the feature.
+    Filter is in MCHW format, where M is the number of output image channels,
+    C is the number of input image channels, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`
+    for more detials.
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma ((Vg) \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`V`: Filter direction value, a tensor with MCHW format.
+    * :math:`g`: Filter magnitude value, a tensor with M format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        name_scope(str) : The name for this class.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          from paddle.fluid.dygraph.base import to_variable
+          import paddle.fluid as fluid
+          from paddle.fluid.dygraph import Conv2D
+          import numpy as np
+
+          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
+          with fluid.dygraph.guard():
+              conv2d = Conv2D( "conv2d", 2, 3)
+              data = to_variable( data )
+              conv = conv2d( data )
+
+    """
+
+    def __init__(self,
+                 name_scope,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 epsilon=1e-30,
+                 dtype="float32"):
+        assert param_attr is not False, "param_attr should not be False here."
+        super(Conv2D, self).__init__(name_scope, dtype)
+        self._groups = groups
+        self._stride = utils.convert_to_list(stride, 2, "stride")
+        self._padding = utils.convert_to_list(padding, 2, "padding")
+        self._dilation = utils.convert_to_list(dilation, 2, "dilation")
+        self._act = act
+        if not isinstance(use_cudnn, bool):
+            raise ValueError("use_cudnn should be True or False")
+        self._use_cudnn = use_cudnn
+        self._filter_size = filter_size
+        self._num_filters = num_filters
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._epsilon = epsilon
+        self._dtype = dtype
+        # if (self._num_channels == self._groups and
+        #         num_filters % self._num_channels == 0 and not self._use_cudnn):
+        #     self._l_type = 'depthwise_conv2d'
+        # else:
+        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
+        #  kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
+        self._l_type = "conv2d"
+
+    def _build_once(self, input):
+        self._num_channels = input.shape[1]
+        if self._groups is None:
+            num_filter_channels = self._num_channels
+        else:
+            if self._num_channels % self._groups != 0:
+                raise ValueError("num_channels must be divisible by groups.")
+            num_filter_channels = self._num_channels // self._groups
+        filter_size = utils.convert_to_list(self._filter_size, 2, "filter_size")
+        filter_shape = [self._num_filters, int(num_filter_channels)
+                        ] + filter_size
+
+        def _get_default_param_initializer():
+            filter_elem_num = filter_size[0] * filter_size[
+                1] * self._num_channels
+            std = (2.0 / filter_elem_num)**0.5
+            return Normal(0.0, std, 0)
+
+        # weight_v
+        self._filter_param_v = self.create_parameter(
+            attr=self._param_attr,
+            shape=filter_shape,
+            dtype=self._dtype,
+            default_initializer=_get_default_param_initializer())
+
+        # weight_g
+        norm_value = _norm(
+            self._filter_param_v.numpy(), dim=0)  # CAUTION: hard-code
+        self._filter_param_g = self.create_parameter(
+            attr=fluid.ParamAttr(
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    norm_value)),
+            shape=norm_value.shape,
+            dtype=self._dtype,
+            default_initializer=_get_default_param_initializer())
+
+        if self._use_cudnn:
+            self.create_variable(
+                name="kCUDNNFwdAlgoCache",
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+            self.create_variable(
+                name="kCUDNNBwdDataAlgoCache",
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+            self.create_variable(
+                name="kCUDNNBwdFilterAlgoCache",
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+
+        self._bias_param = self.create_parameter(
+            attr=self._bias_attr,
+            shape=[self._num_filters],
+            dtype=self._dtype,
+            is_bias=True)
+
+    def forward(self, input):
+        matrix = self._helper.create_variable_for_type_inference(self._dtype)
+        tmp = self._helper.create_variable_for_type_inference(self._dtype)
+        new_shape = [
+            self._filter_param_v.shape[0],
+            reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
+        ]
+
+        self._helper.append_op(
+            type="reshape2",
+            inputs={"X": self._filter_param_v},
+            attrs={"shape": new_shape},
+            outputs={"Out": matrix,
+                     "XShape": tmp})
+
+        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
+        m_normalized = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type="norm",
+            inputs={"X": matrix},
+            outputs={"Out": m_normalized,
+                     "Norm": m_norm},
+            attrs={"axis": 1,
+                   "epsilon": self._epsilon})
+
+        v_normalized = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="reshape2",
+            inputs={"X": m_normalized},
+            attrs={"shape": self._filter_param_v.shape},
+            outputs={"Out": v_normalized,
+                     "XShape": tmp2})
+
+        filter_param = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type="elementwise_mul",
+            inputs={"X": [v_normalized],
+                    "Y": [self._filter_param_g]},
+            outputs={"Out": [filter_param]},
+            attrs={"axis": 0},  # CAUTION: hard-code
+        )
+
+        pre_bias = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+
+        self._helper.append_op(
+            type=self._l_type,
+            inputs={"Input": input,
+                    "Filter": filter_param},
+            outputs={"Output": pre_bias},
+            attrs={
+                "strides": self._stride,
+                "paddings": self._padding,
+                "dilations": self._dilation,
+                "groups": self._groups if self._groups else 1,
+                "use_cudnn": self._use_cudnn,
+                "use_mkldnn": False,
+            })
+
+        if self._bias_param is not None:
+            pre_act = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type="elementwise_add",
+                inputs={"X": [pre_bias],
+                        "Y": [self._bias_param]},
+                outputs={"Out": [pre_act]},
+                attrs={"axis": 1})
+        else:
+            pre_act = pre_bias
+
+        # Currently, we don't support inplace in dygraph mode
+        return self._helper.append_activation(pre_act, act=self._act)
+
+
+class Conv2DTranspose(dg.Layer):
+    """
+    **Convlution2D transpose layer**
+
+    The convolution2D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCHW format. Where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+    Parameters(dilations, strides, paddings) are two elements. These two elements
+    represent height and width, respectively. The details of convolution transpose
+    layer, please refer to the following explanation and references
+    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma ((Vg) \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`V`: Filter value, a tensor with MCHW format.
+    * :math:`g`: Filter value, a tensor with M format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
+           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
+           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
+           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+
+    Args:
+        name_scope(str): The name of this class.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). None if use
+            filter_size, padding, and stride to calculate output_size.
+            if output_size and filter_size are specified at the same time, They
+            should follow the formula above. Default: None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size. Default: None.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups = 1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True.
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+       .. code-block:: python
+
+          import paddle.fluid as fluid
+          import numpy
+
+          with fluid.dygraph.guard():
+              data = numpy.random.random((3, 32, 32)).astype('float32')
+              conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
+                    'Conv2DTranspose', num_filters=2, filter_size=3)
+              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
+
+    """
+
+    def __init__(self,
+                 name_scope,
+                 num_filters,
+                 output_size=None,
+                 filter_size=None,
+                 padding=0,
+                 stride=1,
+                 dilation=1,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 epsilon=1e-30,
+                 act=None,
+                 dtype="float32"):
+        super(Conv2DTranspose, self).__init__(name_scope, dtype)
+        assert (param_attr is not False
+                ), "param_attr should not be False in conv2d_transpose."
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._groups = groups
+        self._num_filters = num_filters
+        self._use_cudnn = use_cudnn
+        self._padding = padding
+        self._stride = stride
+        self._dilation = dilation
+        self._filter_size = filter_size
+        self._output_size = output_size
+        self._op_type = "conv2d_transpose"
+        self._epsilon = epsilon
+
+    def _build_once(self, input):
+        input_channel = input.shape[1]
+        if (input_channel == self._groups and
+                self._num_filters == input_channel and not self._use_cudnn):
+            self._op_type = "depthwise_conv2d_transpose"
+
+        if not isinstance(input, Variable):
+            raise TypeError("Input of conv2d_transpose must be Variable")
+
+        self._padding = utils.convert_to_list(self._padding, 2, "padding")
+        self._stride = utils.convert_to_list(self._stride, 2, "stride")
+        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")
+
+        if not isinstance(self._use_cudnn, bool):
+            raise ValueError("use_cudnn should be True or False")
+
+        if self._filter_size is None:
+            if self._output_size is None:
+                raise ValueError(
+                    "output_size must be set when filter_size is None")
+            if isinstance(self._output_size, int):
+                self._output_size = [self._output_size, self._output_size]
+
+            h_in = input.shape[2]
+            w_in = input.shape[3]
+
+            filter_size_h = (self._output_size[0] -
+                             (h_in - 1) * self._stride[0] + 2 * self._padding[0]
+                             - 1) // self._dilation[0] + 1
+            filter_size_w = (self._output_size[1] -
+                             (w_in - 1) * self._stride[1] + 2 * self._padding[1]
+                             - 1) // self._dilation[1] + 1
+            self._filter_size = [filter_size_h, filter_size_w]
+        else:
+            self._filter_size = utils.convert_to_list(
+                self._filter_size, 2, "conv2d_transpose.filter_size")
+
+        if self._output_size is None:
+            self._output_size = []
+        elif isinstance(self._output_size, list) or isinstance(
+                self._output_size, int):
+            self._output_size = utils.convert_to_list(self._output_size, 2,
+                                                      "output_size")
+        else:
+            raise ValueError("output_size should be list or int")
+        self._padding = utils.convert_to_list(self._padding, 2, "padding")
+        self._groups = 1 if self._groups is None else self._groups
+        filter_shape = [
+            input_channel,
+            self._num_filters // self._groups,
+        ] + self._filter_size
+
+        # img filter v (direction)
+        self._img_filter_v = self.create_parameter(
+            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
+
+        # img filter g (magnitude)
+        img_filter_magnitude = _norm(
+            self._img_filter_v.numpy(), dim=0)  # CAUTION: hard-code
+        self._img_filter_g = self.create_parameter(
+            dtype=input.dtype,
+            shape=img_filter_magnitude.shape,
+            attr=fluid.ParamAttr(
+                initializer=NumpyArrayInitializer(img_filter_magnitude)))
+
+        self._img_bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=[self._num_filters],
+            dtype=self._dtype,
+            is_bias=True)
+
+    def forward(self, input):
+        matrix = self._helper.create_variable_for_type_inference(self._dtype)
+        tmp = self._helper.create_variable_for_type_inference(self._dtype)
+        new_shape = [
+            self._img_filter_v.shape[0],
+            reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
+        ]
+
+        self._helper.append_op(
+            type="reshape2",
+            inputs={"X": self._img_filter_v},
+            attrs={"shape": new_shape},
+            outputs={"Out": matrix,
+                     "XShape": tmp})
+
+        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
+        m_normalized = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type="norm",
+            inputs={"X": matrix},
+            outputs={"Out": m_normalized,
+                     "Norm": m_norm},
+            attrs={"axis": 1,
+                   "epsilon": self._epsilon})
+
+        v_normalized = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="reshape2",
+            inputs={"X": m_normalized},
+            attrs={"shape": self._img_filter_v.shape},
+            outputs={"Out": v_normalized,
+                     "XShape": tmp2})
+
+        img_filter = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type="elementwise_mul",
+            inputs={"X": [v_normalized],
+                    "Y": [self._img_filter_g]},
+            outputs={"Out": [img_filter]},
+            attrs={"axis": 0},  # CAUTION: hard-code
+        )
+
+        pre_bias = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype)
+        self._helper.append_op(
+            type=self._op_type,
+            inputs={"Input": [input],
+                    "Filter": [img_filter]},
+            outputs={"Output": pre_bias},
+            attrs={
+                "output_size": self._output_size,
+                "strides": self._stride,
+                "paddings": self._padding,
+                "dilations": self._dilation,
+                "groups": self._groups,
+                "use_cudnn": self._use_cudnn,
+            })
+
+        if self._img_bias is not None:
+            pre_act = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type="elementwise_add",
+                inputs={"X": [pre_bias],
+                        "Y": [self._img_bias]},
+                outputs={"Out": [pre_act]},
+                attrs={"axis": 1})
+        else:
+            pre_act = pre_bias
+
+        out = self._helper.append_activation(pre_act)
+        return out