add models & modules back

2019-11-25 03:40:52 +00:00 · 2019-11-25 03:40:52 +00:00 · de4c5d4f49
parent a715e6029d
commit de4c5d4f49
24 changed files with 1701 additions and 0 deletions
--- a/parakeet/models/deepvoice3/README.md
+++ b/parakeet/models/deepvoice3/README.md
--- a/parakeet/models/deepvoice3/README_cn.md
+++ b/parakeet/models/deepvoice3/README_cn.md
--- a/parakeet/models/deepvoice3/_ce.py
+++ b/parakeet/models/deepvoice3/_ce.py
--- a/parakeet/models/deepvoice3/_images/model_architecture.png
+++ b/parakeet/models/deepvoice3/_images/model_architecture.png
--- a/parakeet/models/deepvoice3/audio.py
+++ b/parakeet/models/deepvoice3/audio.py
--- a/parakeet/models/deepvoice3/builder.py
+++ b/parakeet/models/deepvoice3/builder.py
--- a/parakeet/models/deepvoice3/compute_timestamp_ratio.py
+++ b/parakeet/models/deepvoice3/compute_timestamp_ratio.py
--- a/parakeet/models/deepvoice3/data.py
+++ b/parakeet/models/deepvoice3/data.py
--- a/parakeet/models/deepvoice3/deepvoice3.py
+++ b/parakeet/models/deepvoice3/deepvoice3.py
--- a/parakeet/models/deepvoice3/dry_run.py
+++ b/parakeet/models/deepvoice3/dry_run.py
--- a/parakeet/models/deepvoice3/eval_model.py
+++ b/parakeet/models/deepvoice3/eval_model.py
--- a/parakeet/models/deepvoice3/hparams.py
+++ b/parakeet/models/deepvoice3/hparams.py
--- a/parakeet/models/deepvoice3/ljspeech.py
+++ b/parakeet/models/deepvoice3/ljspeech.py
--- a/parakeet/models/deepvoice3/preprocess.py
+++ b/parakeet/models/deepvoice3/preprocess.py
--- a/parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json
+++ b/parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json
--- a/parakeet/models/deepvoice3/synthesis.py
+++ b/parakeet/models/deepvoice3/synthesis.py
--- a/parakeet/models/deepvoice3/train.py
+++ b/parakeet/models/deepvoice3/train.py
--- a/parakeet/models/deepvoice3/train.sh
+++ b/parakeet/models/deepvoice3/train.sh
--- a/parakeet/models/deepvoice3/train_model.py
+++ b/parakeet/models/deepvoice3/train_model.py
--- a/parakeet/modules/init.py
+++ b/parakeet/modules/init.py
--- a/parakeet/modules/conv.py
+++ b/parakeet/modules/conv.py
@ -0,0 +1,222 @@
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import numpy as np
 import paddle
 from paddle import fluid
 import paddle.fluid.dygraph as dg
 from weight_norm import Conv2D, Conv2DTranspose
 class Conv1D(dg.Layer):
    """
    A 1D convolution block implemented with Conv2D. For simplicity, and with
    the aim of keeping the output the same length as the input, it does not
    allow stride > 1.

    The wrapped Conv2D uses a (1, filter_size) kernel, a (1, 1) stride and a
    (1, dilation) dilation, so inputs are laid out as (B, C, 1, T). Besides
    the usual ``forward``, the layer offers ``start_new_sequence`` and
    ``add_input`` for computing the output one time step at a time.
    """
    def __init__(self,
                 name_scope,
                 in_cahnnels,
                 num_filters,
                 filter_size=3,
                 dilation=1,
                 groups=None,
                 causal=False,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 dtype="float32"):
        """
        Args:
            name_scope: name scope handed to the base layer.
            in_cahnnels: number of input channels, stored as
                ``self.in_channels``. NOTE(review): the parameter name looks
                like a typo of ``in_channels``; renaming it would break
                callers that pass it by keyword.
            num_filters: number of output channels.
            filter_size: kernel width along the time axis.
            dilation: dilation along the time axis.
            groups: forwarded to Conv2D.
            causal: when True, pad by the full receptive field
                ``dilation * (filter_size - 1)`` and let ``forward`` trim the
                trailing steps; otherwise pad by half of that (floored).
            param_attr, bias_attr, use_cudnn, act, dtype: forwarded to
                Conv2D.
        """
        super(Conv1D, self).__init__(name_scope, dtype=dtype)
        if causal:
            padding = dilation * (filter_size - 1)
        else:
            padding = (dilation * (filter_size - 1)) // 2
        self.in_channels = in_cahnnels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.padding = padding
        self.act = act
        # Every 1D setting is lifted to 2D by prepending a size-1 "height"
        # axis: kernel (1, k), stride (1, 1), dilation (1, d), padding (0, p).
        self.conv = Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, 1),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
                input channels.
        Returns:
            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
                output channels (num_filters).
        """
        x = self.conv(x)
        if self.filter_size > 1:
            if self.causal:
                # Causal case: drop the last `padding` steps of the time axis.
                x = fluid.layers.slice(
                    x, axes=[3], starts=[0], ends=[-self.padding])
            elif self.filter_size % 2 == 0:
                # Non-causal, even kernel width: drop the last time step.
                # NOTE(review): with padding floored in __init__, confirm
                # that this trim really yields T output steps for even
                # filter sizes.
                x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
        return x
    def start_new_sequence(self):
        """
        Reset the state used by ``add_input``: the cached weight matrix and
        the buffer of past inputs.
        NOTE(review): these two attributes are only created here, so this
        method apparently has to be called before the first ``add_input``
        -- confirm against callers.
        """
        self.temp_weight = None
        self.input_buffer = None
    def add_input(self, x):
        """
        Adding input for a time step and compute an output for a time step.
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
                input channels, and T = 1.
        Returns:
            out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
            means output channels (num_filters), and T = 1.
        """
        # Compute the matmul-form weight once per sequence and reuse it.
        if self.temp_weight is None:
            self.temp_weight = self._reshaped_weight()
        # Number of time steps spanned by the dilated kernel.
        window_size = 1 + (self.filter_size - 1) * self.dilation
        batch_size = x.shape[0]
        in_channels = x.shape[1]
        if self.filter_size > 1:
            if self.input_buffer is None:
                # First step: history is window_size - 1 steps of zeros.
                self.input_buffer = fluid.layers.fill_constant(
                    [batch_size, in_channels, 1, window_size - 1],
                    dtype=x.dtype,
                    value=0.0)
            else:
                # Later steps: drop the oldest step to make room.
                self.input_buffer = self.input_buffer[:, :, :, 1:]
            # Append the new step at the end of the time axis.
            self.input_buffer = fluid.layers.concat(
                [self.input_buffer, x], axis=3)
            x = self.input_buffer
            if self.dilation > 1:
                # Keep only every `dilation`-th buffered step. The index
                # tensor is built once and cached on the layer.
                if not hasattr(self, "indices"):
                    self.indices = dg.to_variable(
                        np.arange(0, window_size, self.dilation))
                # Swap axes 0 and 3 so time is the leading axis for gather
                # (presumably gather selects along axis 0 -- confirm), then
                # swap back.
                tmp = fluid.layers.transpose(
                    self.input_buffer, perm=[3, 1, 2, 0])
                tmp = fluid.layers.gather(tmp, index=self.indices)
                tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
                x = tmp
        # Flatten the window to (B, C_in * filter_size) and apply the
        # convolution as a matrix product with the linearized filter.
        inputs = fluid.layers.reshape(
            x, shape=[batch_size, in_channels * 1 * self.filter_size])
        out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
        out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
        # Restore the two trailing singleton axes: (B, C_out) -> (B, C_out, 1, 1).
        out = fluid.layers.reshape(out, out.shape + [1, 1])
        out = self._helper.append_activation(out, act=self.act)
        return out
    def _reshaped_weight(self):
        """
        Get the linearized weight of the convolution filter, since it is by
        nature a matmul weight. Because the model uses weight norm, the
        weight is computed as l2_normalize(weight_v) * weight_g (row-wise).
        Returns:
            weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
        """
        shape = self.conv._filter_param_v.shape
        # Collapse every axis after the first into one.
        matrix_shape = [shape[0], np.prod(shape[1:])]
        weight_matrix = fluid.layers.reshape(
            self.conv._filter_param_v, shape=matrix_shape)
        weight_matrix = fluid.layers.elementwise_mul(
            fluid.layers.l2_normalize(
                weight_matrix, axis=1),
            self.conv._filter_param_g,
            axis=0)
        return weight_matrix
 class Conv1DTranspose(dg.Layer):
    """
    A transposed 1D convolution block built on a transposed 2D convolution.
    Inputs are laid out as (B, C, 1, T); every 1D setting is lifted to 2D by
    prepending a size-1 "height" component. The layer does not guarantee
    that the time dimension grows by exactly ``stride`` times.
    """
    def __init__(self,
                 name_scope,
                 in_channels,
                 num_filters,
                 filter_size,
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 dtype="float32"):
        """
        Args:
            name_scope: name scope handed to the base layer.
            in_channels: number of input channels (stored only).
            num_filters: number of output channels.
            filter_size: kernel width along the time axis.
            padding, stride, dilation: 1D settings along the time axis.
            groups, param_attr, bias_attr, use_cudnn, act, dtype: forwarded
                unchanged to Conv2DTranspose.
        """
        super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype)
        # Keep the 1D hyper-parameters around for inspection.
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.groups = groups
        # 2D counterparts of the 1D settings.
        kernel_2d = (1, filter_size)
        padding_2d = (0, padding)
        stride_2d = (1, stride)
        dilation_2d = (1, dilation)
        self.conv_transpose = Conv2DTranspose(
            self.full_name(),
            num_filters,
            filter_size=kernel_2d,
            padding=padding_2d,
            stride=stride_2d,
            dilation=dilation_2d,
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input
                channels and T_in means the number of time steps of input.
        Returns:
            out (Variable): Shape(B, C_out, 1, T_out), where C_out means the
                output channels and T_out means the number of time steps of
                output.
        """
        out = self.conv_transpose(x)
        return out
--- a/parakeet/modules/loss.py
+++ b/parakeet/modules/loss.py
@ -0,0 +1,158 @@
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import numpy as np
 from numba import jit
 from paddle import fluid
 import paddle.fluid.dygraph as dg
 def masked_mean(inputs, mask):
    """
    Mean of ``inputs`` taken over the positions selected by ``mask``.

    The mask is reshaped to (B, 1, 1, T) and repeated along the channel axis
    so that it lines up with ``inputs``; the masked sum is then divided by
    the sum of the expanded mask. No gradient flows through the mask or the
    count.

    Args:
        inputs (Variable): Shape(B, C, 1, T), the input, where B means
            batch size, C means channels of input, T means timesteps of
            the input.
        mask (Variable): Shape(B, T), a mask.
    Returns:
        loss (Variable): Shape(1, ), masked mean.
    """
    batch, steps = mask.shape[0], mask.shape[-1]
    mask_b11t = fluid.layers.reshape(mask, shape=[batch, 1, 1, steps])
    mask_bc1t = fluid.layers.expand(
        mask_b11t, expand_times=[1, inputs.shape[1], 1, 1])
    mask_bc1t.stop_gradient = True
    n_valid = fluid.layers.reduce_sum(mask_bc1t)
    n_valid.stop_gradient = True
    return fluid.layers.reduce_sum(inputs * mask_bc1t) / n_valid
@jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    """
    Build one guide matrix of shape (max_N, max_T), dtype float32.

    Entry (n, t) for n < N and t < T is
    ``1 - exp(-(n / N - t / T)**2 / (2 * g * g))``, which is 0 where the
    relative positions n / N and t / T coincide and grows as they drift
    apart. Entries outside the valid N x T region stay 0.
    """
    guide = np.zeros((max_N, max_T), dtype=np.float32)
    for row in range(N):
        for col in range(T):
            offset = row / N - col / T
            guide[row, col] = 1 - np.exp(-offset**2 / (2 * g * g))
    return guide
 def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2):
    """
    Stack one guide matrix per example into a batch.

    Args:
        input_lengths: per-example valid input lengths; must support
            ``len()``, ``.max()`` and integer indexing.
        target_lengths: per-example valid target lengths, indexed in step
            with ``input_lengths``.
        max_target_len: size of the target (second) axis of the result.
        g: width parameter passed on to ``guided_attention``.
    Returns:
        numpy.ndarray: Shape(B, max_target_len, max(input_lengths)), dtype
            float32. Each slice is the transpose of the matrix returned by
            ``guided_attention``.
    """
    batch = len(input_lengths)
    widest_input = input_lengths.max()
    guides = np.zeros((batch, max_target_len, widest_input), dtype=np.float32)
    for idx, n_in in enumerate(input_lengths):
        n_out = target_lengths[idx]
        single = guided_attention(n_in, widest_input, n_out, max_target_len, g)
        guides[idx] = single.T
    return guides
 class TTSLoss(object):
    """
    Collection of the loss terms used for training: an L1 loss, a binary
    divergence, a "done" loss and a guided attention loss.
    """
    def __init__(self,
                 masked_weight=0.0,
                 priority_weight=0.0,
                 binary_divergence_weight=0.0,
                 guided_attention_sigma=0.2):
        """
        Args:
            masked_weight: share (w) given to the masked mean when a mask is
                supplied; the plain mean gets 1 - w.
            priority_weight: share (p) given to the priority-channel L1 loss;
                the all-channel L1 loss gets 1 - p.
            binary_divergence_weight: stored on the instance; not read by
                any method of this class.
            guided_attention_sigma: width parameter for the attention guide.
        """
        self.guided_attention_sigma = guided_attention_sigma
        self.binary_divergence_weight = binary_divergence_weight
        self.priority_weight = priority_weight
        self.masked_weight = masked_weight
    def _mask_weighted_mean(self, values, mask):
        """
        Reduce ``values`` to a scalar: a blend of masked mean and plain mean
        when masked_weight > 0 and a mask is given, the plain mean otherwise.
        """
        w = self.masked_weight
        if not (w > 0 and mask is not None):
            return fluid.layers.reduce_mean(values)
        return w * masked_mean(values, mask) + (
            1 - w) * fluid.layers.reduce_mean(values)
    def l1_loss(self, prediction, target, mask, priority_bin=None):
        """
        Mask-weighted L1 loss between ``prediction`` and ``target``. When
        priority_weight > 0 and ``priority_bin`` is given, the loss over the
        first ``priority_bin`` channels (axis 1) is blended in.
        """
        abs_diff = fluid.layers.abs(prediction - target)
        overall = self._mask_weighted_mean(abs_diff, mask)
        if not (self.priority_weight > 0 and priority_bin is not None):
            return overall
        # Same reduction, restricted to channels [0, priority_bin).
        leading_channels = fluid.layers.slice(
            abs_diff, axes=[1], starts=[0], ends=[priority_bin])
        prioritized = self._mask_weighted_mean(leading_channels, mask)
        p = self.priority_weight
        return p * prioritized + (1 - p) * overall
    def binary_divergence(self, prediction, target, mask):
        """
        Element-wise log loss between ``prediction`` and ``target``, reduced
        with the same mask weighting as ``l1_loss``.
        """
        # log_loss is applied on column vectors, then the original shape is
        # restored before the reduction.
        pred_col = fluid.layers.reshape(prediction, [-1, 1])
        target_col = fluid.layers.reshape(target, [-1, 1])
        loss_col = fluid.layers.log_loss(pred_col, target_col, epsilon=1e-8)
        bin_div = fluid.layers.reshape(loss_col, prediction.shape)
        return self._mask_weighted_mean(bin_div, mask)
    @staticmethod
    def done_loss(done_hat, done):
        """Mean log loss between the predicted and the true done flags."""
        per_element = fluid.layers.log_loss(
            fluid.layers.reshape(done_hat, [-1, 1]),
            fluid.layers.reshape(done, [-1, 1]),
            epsilon=1e-8)
        return fluid.layers.reduce_mean(per_element)
    def attention_loss(self, predicted_attention, input_lengths,
                       target_lengths):
        """
        Given valid encoder_lengths and decoder_lengths, compute a diagonal
        guide, and compute loss from the predicted attention and the guide.
        Args:
            predicted_attention (Variable): Shape(N, B, T_dec, T_enc), the
                alignment tensor, where N is a leading dimension (the code
                expects exactly four dimensions), B means batch size, T_dec
                means number of time steps of the decoder, T_enc means the
                number of time steps of the encoder.
            input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
                (time steps) of encoder outputs.
            target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
                valid lengths (time steps) of decoder outputs.
        Returns:
            loss (Variable): Shape(1, ) attention loss.
        """
        # Unpacking doubles as a check that the tensor is 4-dimensional; only
        # the decoder length is needed to size the guide.
        _, _, max_target_len, _ = predicted_attention.shape
        guide = guided_attentions(input_lengths, target_lengths,
                                  max_target_len,
                                  self.guided_attention_sigma)
        guide_var = dg.to_variable(guide)
        return fluid.layers.reduce_mean(predicted_attention * guide_var)
--- a/parakeet/modules/modules.py
+++ b/parakeet/modules/modules.py
@ -0,0 +1,458 @@
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle
 from paddle import fluid
 import paddle.fluid.dygraph as dg
 import numpy as np
 import conv
 import weight_norm as weight_norm
 def FC(name_scope,
        in_features,
        size,
        num_flatten_dims=1,
        dropout=0.0,
        epsilon=1e-30,
        act=None,
        is_test=False,
        dtype="float32"):
    """
    Build a weight-normalized fully connected layer whose weights are drawn
    from normal(0, std=np.sqrt((1 - dropout) / in_features)) and whose bias
    starts at zero.

    ``in_features`` may be a single int or a sequence of ints; one weight
    attribute is created per entry. ``epsilon`` and ``is_test`` are accepted
    but not used by this function.
    """
    if isinstance(in_features, int):
        in_features = [in_features]
    # One ParamAttr per input, each with its own fan-in dependent std.
    weight_attrs = [
        fluid.ParamAttr(initializer=fluid.initializer.NormalInitializer(
            scale=np.sqrt((1 - dropout) / fan_in)))
        for fan_in in in_features
    ]
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))
    return weight_norm.FC(name_scope,
                          size,
                          num_flatten_dims=num_flatten_dims,
                          param_attr=weight_attrs,
                          bias_attr=bias_attr,
                          act=act,
                          dtype=dtype)
 def Conv1D(name_scope,
            in_channels,
            num_filters,
            filter_size=3,
            dilation=1,
            groups=None,
            causal=False,
            std_mul=1.0,
            dropout=0.0,
            use_cudnn=True,
            act=None,
            dtype="float32"):
    """
    Build a conv.Conv1D layer whose weights are drawn from
    normal(0, std=np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)))
    and whose bias starts at zero.
    """
    fan_in = filter_size * in_channels
    std = np.sqrt((std_mul * (1 - dropout)) / fan_in)
    weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(
            loc=0.0, scale=std))
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))
    return conv.Conv1D(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        dilation,
        groups=groups,
        causal=causal,
        param_attr=weight_attr,
        bias_attr=bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
 def Embedding(name_scope,
               num_embeddings,
               embed_dim,
               is_sparse=False,
               is_distributed=False,
               padding_idx=None,
               std=0.01,
               dtype="float32"):
    """
    Build a dg.Embedding table of shape (num_embeddings, embed_dim) whose
    weights are drawn from a normal distribution with scale ``std``.

    Args:
        name_scope: name scope handed to the layer.
        num_embeddings: number of rows in the table.
        embed_dim: size of each embedding vector.
        is_sparse: forwarded to dg.Embedding.
        is_distributed: forwarded to dg.Embedding.
        padding_idx: forwarded to dg.Embedding.
        std: scale of the normal initializer.
        dtype: data type of the table.
    Returns:
        The constructed dg.Embedding layer.
    """
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    # is_sparse / is_distributed used to be accepted and then silently
    # dropped; pass them through so the caller's choice takes effect.
    layer = dg.Embedding(
        name_scope, (num_embeddings, embed_dim),
        is_sparse=is_sparse,
        is_distributed=is_distributed,
        padding_idx=padding_idx,
        param_attr=weight_attr,
        dtype=dtype)
    return layer
 class Conv1DGLU(dg.Layer):
    """
    A 1D convolution block with GLU activation. Dropout is applied to the
    input, the convolution produces 2 * num_filters channels that are split
    into a content half and a gate half, and the output is
    sigmoid(gate) * content. In the multi-speaker case a speaker embedding,
    projected by a width-1 convolution with softsign activation, is added to
    the content half. With ``residual`` enabled the input is added back and
    the sum is scaled by np.sqrt(0.5).
    """
    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
        # Convolution hyper-parameters.
        self.in_channels = in_channels
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual
        # Initialization scale and dropout rate.
        self.std_mul = std_mul
        self.dropout = dropout
        if residual:
            # The skip connection adds input and output element-wise.
            assert (
                in_channels == num_filters
            ), "this block uses residual connection"\
                "the input_channes should equals num_filters"
        # Twice the filters: one half is content, the other the gate.
        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            std_mul=std_mul,
            dropout=dropout,
            dtype=dtype)
        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            # Width-1 convolution acting as a per-step linear projection of
            # the speaker embedding.
            self.fc = Conv1D(
                self.full_name(),
                speaker_dim,
                num_filters,
                filter_size=1,
                dilation=1,
                causal=False,
                act="softsign",
                dtype=dtype)
    def _gated_output(self, conv_out, residual, speaker_embed):
        """
        Shared tail of ``forward`` and ``add_input``: split the convolution
        output into content and gate, fuse the speaker embedding if given,
        apply the GLU and the optional residual connection.
        """
        content, gate = fluid.layers.split(
            conv_out, num_or_sections=2, dim=1)
        if speaker_embed is not None:
            content = content + self.fc(speaker_embed)
        out = fluid.layers.elementwise_mul(
            fluid.layers.sigmoid(gate), content)
        if self.residual:
            out = fluid.layers.scale(out + residual, np.sqrt(0.5))
        return out
    def forward(self, x, speaker_embed_bc1t=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
                layer, where B means batch_size, C_in means the input channels
                T means input time steps.
            speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded
                speaker embed, where C_sp means speaker embedding size. Note
                that when using residual connection, the Conv1DGLU does not
                change the number of channels, so out channels equals input
                channels.
        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
                C_out means the output channels of Conv1DGLU.
        """
        dropped = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        conv_out = self.conv(dropped)
        return self._gated_output(conv_out, x, speaker_embed_bc1t)
    def add_input(self, x, speaker_embed_bc11=None):
        """
        Step-wise counterpart of ``forward``: feed one time step through the
        convolution's ``add_input`` and produce one output step.
        Args:
            x (Variable): Shape(B, num_filters, 1, time_steps), the input
                for this step.
            speaker_embed_bc11 (Variable): Shape(B, speaker_dim, 1,
                time_steps), the speaker embedding for this step.
        Returns:
            out (Variable): Shape(B, num_filters, 1, time_steps), where
                time_steps = 1.
        """
        dropped = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        conv_out = self.conv.add_input(dropped)
        return self._gated_output(conv_out, x, speaker_embed_bc11)
 def Conv1DTranspose(name_scope,
                     in_channels,
                     num_filters,
                     filter_size,
                     padding=0,
                     stride=1,
                     dilation=1,
                     groups=None,
                     std_mul=1.0,
                     dropout=0.0,
                     use_cudnn=True,
                     act=None,
                     dtype="float32"):
    """
    Build a conv.Conv1DTranspose layer whose weights are drawn from a normal
    distribution with
    scale=np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
    and whose bias starts at zero.
    """
    fan_in = in_channels * filter_size
    std = np.sqrt(std_mul * (1 - dropout) / fan_in)
    weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(scale=std))
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))
    return conv.Conv1DTranspose(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        groups=groups,
        param_attr=weight_attr,
        bias_attr=bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
 def compute_position_embedding(rad):
    """
    Turn a table of angles into a position embedding table.

    ``rad`` has shape (embed_dim, n_vocab). Rows with an even index are
    replaced by their sine and rows with an odd index by their cosine; the
    result is transposed to (n_vocab, embed_dim).
    """
    # Unpacking also checks that ``rad`` is 2-dimensional.
    embed_dim, _ = rad.shape
    sin_rows = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
    cos_rows = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
    sin_part = fluid.layers.sin(fluid.layers.gather(rad, sin_rows))
    cos_part = fluid.layers.cos(fluid.layers.gather(rad, cos_rows))
    # Write the sines first, then the cosines on top of that result.
    with_sines = fluid.layers.scatter(rad, sin_rows, sin_part)
    embedding = fluid.layers.scatter(with_sines, cos_rows, cos_part)
    return fluid.layers.transpose(embedding, perm=[1, 0])
 def position_encoding_init(n_position,
                            d_pos_vec,
                            position_rate=1.0,
                            sinusoidal=True):
    """
    Init the sinusoid position encoding table.

    Entry (pos, i) starts as the angle
    ``position_rate * pos / 10000 ** (2 * (i // 2) / d_pos_vec)``. Row 0 is
    kept as a zero vector for the padding position.

    Args:
        n_position (int): number of positions (rows).
        d_pos_vec (int): size of each position vector (columns).
        position_rate (float): multiplier applied to every position.
        sinusoidal (bool): if True, rows 1.. get sin applied to the even
            columns and cos to the odd columns; if False the raw angles are
            returned.
    Returns:
        numpy.ndarray: Shape(n_position, d_pos_vec). An ``n_position`` of 0
            yields an empty (0, d_pos_vec) table.
    """
    positions = np.arange(n_position, dtype=np.float64).reshape(-1, 1)
    # Explicit float arithmetic keeps the exponent a true division even
    # where "/" on ints would truncate (Python 2 without the division
    # future import).
    exponents = 2.0 * (np.arange(d_pos_vec) // 2) / float(d_pos_vec)
    position_enc = position_rate * positions / np.power(10000.0, exponents)
    # Keep idx 0 for the padding token: its encoding is the zero vector.
    position_enc[:1] = 0.0
    if sinusoidal:
        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc
 class PositionEmbedding(dg.Layer):
    """
    Position embedding whose table stores raw angles (the weight is set from
    ``position_encoding_init(..., sinusoidal=False)``). The sine/cosine is
    applied at lookup time, which allows the angles to be rescaled by a
    speaker-dependent position rate first.
    """
    def __init__(self,
                 name_scope,
                 n_position,
                 d_pos_vec,
                 position_rate=1.0,
                 is_sparse=False,
                 is_distributed=False,
                 param_attr=None,
                 max_norm=None,
                 padding_idx=None,
                 dtype="float32"):
        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
        # The inner table is always created without a padding index; the
        # padding index given here is applied by the lookup op in forward.
        self.embed = dg.Embedding(
            self.full_name(),
            size=(n_position, d_pos_vec),
            is_sparse=is_sparse,
            is_distributed=is_distributed,
            padding_idx=None,
            param_attr=param_attr,
            dtype=dtype)
        angles = position_encoding_init(
            n_position,
            d_pos_vec,
            position_rate=position_rate,
            sinusoidal=False)
        self.set_weight(angles.astype(dtype))
        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False
        # -1 is the lookup op's special value for "no padding index";
        # negative indices count from the end of the table.
        if padding_idx is None:
            self._padding_idx = -1
        elif padding_idx >= 0:
            self._padding_idx = padding_idx
        else:
            self._padding_idx = n_position + padding_idx
        self._position_rate = position_rate
        self._max_norm = max_norm
        self._dtype = dtype
    def set_weight(self, array):
        """Overwrite the embedding table with ``array`` (shapes must match)."""
        assert self.embed._w.shape == list(array.shape), "shape does not match"
        tensor = self.embed._w._ivar.value().get_tensor()
        tensor.set(array, fluid.framework._current_expected_place())
    def _lookup(self, ids, weight, padding_idx):
        """Run one lookup_table op for ``ids`` on ``weight``; return its output."""
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="lookup_table",
            inputs={"Ids": ids,
                    "W": weight},
            outputs={"Out": out},
            attrs={
                "is_sparse": self._is_sparse,
                "is_distributed": self._is_distributed,
                "remote_prefetch": self._remote_prefetch,
                "padding_idx": padding_idx,
            })
        return out
    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): Shape (B, T, 1), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional), position
                rate. A scalar, or a Variable with shape [1, 1], is used for
                every example. A Variable with more than one element must
                have shape (B, 1) and holds one position rate per example.
        Returns:
            out (Variable): position embedding; presumably Shape(B, T, C_pos)
                where C_pos means position embedding size -- confirm against
                the lookup_table op.
        """
        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
        batch_size = indices.shape[0]
        # No rate: use the stored angles as they are.
        if speaker_position_rate is None:
            weight = compute_position_embedding(rad)
            return self._lookup(indices, weight, self._padding_idx)
        # One rate shared by the whole batch: rescale the angles once.
        shared_rate = (
            np.isscalar(speaker_position_rate) or
            isinstance(speaker_position_rate, fluid.framework.Variable) and
            speaker_position_rate.shape == [1, 1])
        if shared_rate:
            if np.isscalar(speaker_position_rate):
                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
            else:
                scaled_rad = fluid.layers.elementwise_mul(
                    rad, speaker_position_rate[0])
            weight = compute_position_embedding(scaled_rad)
            return self._lookup(indices, weight, self._padding_idx)
        # One rate per example: build a table and look up per example, then
        # stack. This path always uses -1 as the padding index.
        if np.prod(speaker_position_rate.shape) > 1:
            assert speaker_position_rate.shape == [batch_size, 1]
            per_example = []
            for i in range(batch_size):
                rate = speaker_position_rate[i]  # rate has shape [1]
                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
                weight = compute_position_embedding(scaled_rad)
                per_example.append(self._lookup(indices[i], weight, -1))
            return fluid.layers.stack(per_example)
        raise Exception("Then you can just use position rate at init")
--- a/parakeet/modules/weight_norm.py
+++ b/parakeet/modules/weight_norm.py
@ -0,0 +1,863 @@
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import numpy as np
 from six.moves import reduce
 from copy import deepcopy
 import paddle
 from paddle import fluid
 import paddle.fluid.dygraph as dg
 from paddle.fluid import core
 from paddle.fluid.layers import utils
 from paddle.fluid.framework import Variable
 from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
 def _norm(p, dim):
    """Compute the 2-norm of ``p`` over all dimensions except ``dim``.

    It differs from the pytorch implementation in that the reduced dimensions
    are not kept: for an integer ``dim`` the result is 1-D with
    ``p.shape[dim]`` entries. This difference is related to the broadcast
    mechanism in paddle. Read elementwise_mul for more.

    Args:
        p (numpy.ndarray): The array to reduce.
        dim (int|None): The dimension to keep. A negative value counts from
            the last dimension. If None, the norm over all entries of ``p``
            is returned.

    Returns:
        The norm over all entries (``dim`` is None) or a 1-D array holding
        one norm per index along ``dim``.
    """
    if dim is None:
        # Flatten first: with ord=2 and axis=None, np.linalg.norm treats a
        # 2-D input as a matrix (spectral norm) and rejects inputs with more
        # than two dimensions.
        return np.linalg.norm(np.reshape(p, (-1, )), ord=2)
    if dim < 0:
        dim += p.ndim
    if dim == 0:
        p = np.reshape(p, (p.shape[0], -1))
        return np.linalg.norm(p, ord=2, axis=1)
    elif dim == p.ndim - 1:
        p = np.reshape(p, (-1, p.shape[-1]))
        return np.linalg.norm(p, ord=2, axis=0)
    else:
        # Swap the kept dimension to the front, then reduce it exactly like
        # the dim == 0 case.
        perm = list(range(p.ndim))
        perm[0] = dim
        perm[dim] = 0
        return _norm(np.transpose(p, axes=perm), 0)
 class FC(dg.Layer):
    """
    **Fully Connected Layer (weight normalized)**

    This layer creates a fully connected layer in the network. It can take
    one or multiple tensors as its inputs(input can be a list of Variable, see
    Args in detail). It creates a pair of variables called (magnitude(g),
    direction(V)) for each input tensor. Elementwise_mul(normalize(V), g)
    represents a fully connected weight matrix from each input unit to each
    output unit, where V is normalized along axis 0.
    The fully connected layer multiplies each input tensor
    with its corresponding weight to produce an output Tensor with shape [M, `size`],
    where M is batch size. If multiple input tensors are given, the results of
    multiple output tensors with shape [M, `size`] will be summed up. If a bias
    parameter is created, it will be added to the output.
    Finally, if activation is not None, it will be applied to the output as well.

    When the input is single tensor:

    .. math::

        Out = Act({X(normalize(V)g) + b})

    When the input are multiple tensors:

    .. math::

        Out = Act({\\sum_{i=0}^{N-1}X_i(normalize(V_i)g_i) + b})

    In the above equation:

    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
    * :math:`X_i`: The i-th input tensor.
    * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
    * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
    * :math:`b`: The bias parameter created by this layer (if needed).
    * :math:`Act`: The activation function.
    * :math:`Out`: The output tensor.

    See below for an example.

    .. code-block:: text

        Given:
            data_1.data = [[[0.1, 0.2],
                           [0.3, 0.4]]]
            data_1.shape = (1, 2, 2) # 1 is batch_size

            data_2 = [[[0.1, 0.2, 0.3]]]
            data_2.shape = (1, 1, 3)

            out = fluid.layers.fc(input=[data_1, data_2], size=2)

        Then:
            out.data = [[0.18669507, 0.1893476]]
            out.shape = (1, 2)

    Args:
        name_scope(str): The name of this class.
        size(int): The number of output units in this layer.
        num_flatten_dims (int): The fc layer can accept an input tensor with more than
            two dimensions. If this happens, the multidimensional tensor will first be flattened
            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
            dimensions will be flatten to form the first dimension of the final matrix (height of
            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
            form the second dimension of the final matrix (width of the matrix). For example, suppose
            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
        epsilon (float): The epsilon attribute passed to the norm op that
            normalizes the direction matrix. Default: 1e-30
        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
            parameters/weights of this layer.
        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
            of this layer. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized zero. Default: None.
        act (str|None): Activation to be applied to the output of this layer.
        is_test(bool): A flag indicating whether execution is in test phase.
            It is accepted but not used by this layer. Default: False
        dtype(str): Dtype used for weight

    Raises:
        ValueError: If rank of the input tensor is less than 2.

    Examples:
        .. code-block:: python

          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from paddle.fluid.dygraph import FC
          import numpy as np

          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
          with fluid.dygraph.guard():
              fc = FC( "fc", 64, num_flatten_dims=2)
              data = to_variable( data )
              conv = fc( data )
    """

    def __init__(self,
                 name_scope,
                 size,
                 num_flatten_dims=1,
                 epsilon=1e-30,
                 param_attr=None,
                 bias_attr=None,
                 act=None,
                 is_test=False,
                 dtype="float32"):
        super(FC, self).__init__(name_scope, dtype)

        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._epsilon = epsilon
        self._dtype = dtype
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        # One magnitude / direction parameter per input tensor; both lists
        # are filled by _build_once.
        self.__g = list()
        self.__v = list()

    # NOTE: a property getter only ever receives `self` and a setter only
    # `self, value`, so `i` always takes its default and these accessors
    # expose the parameters of the first input only.
    @property
    def _v(self, i=0):
        return self.__v[i]

    @property
    def _g(self, i=0):
        return self.__g[i]

    @_v.setter
    def _v(self, value, i=0):
        # `Parameter` is not imported at file level; reach it through the
        # already imported `fluid` package instead of a bare name.
        assert isinstance(value, fluid.framework.Parameter)
        self.__v[i] = value

    @_g.setter
    def _g(self, value, i=0):
        assert isinstance(value, fluid.framework.Parameter)
        self.__g[i] = value

    def _build_once(self, input):
        """Create the direction (v), magnitude (g) and bias parameters.

        For every input tensor a direction matrix of shape
        [prod(input.shape[num_flatten_dims:]), size] is created, and its
        magnitude vector is initialized with the column norms of that matrix.
        """
        for i, (inp, param) in enumerate(
                self._helper.iter_inputs_and_params(input, self._param_attr)):
            input_shape = inp.shape
            param_shape = [
                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
                       1)
            ] + [self._size]
            self.__v.append(
                self.add_parameter(
                    "_v%d" % i,
                    self.create_parameter(
                        attr=param,
                        shape=param_shape,
                        dtype=self._dtype,
                        is_bias=False)))

            # g starts as the per-column 2-norm of the freshly created v.
            magnitude_shape = param_shape[1:]
            magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0)

            self.__g.append(
                self.add_parameter(
                    "_g%d" % i,
                    self.create_parameter(
                        attr=fluid.ParamAttr(
                            initializer=fluid.initializer.NumpyArrayInitializer(
                                magnitude_value)),
                        shape=magnitude_shape,
                        dtype=self._dtype,
                        is_bias=False)))

        size = list([self._size])
        self._b = self.create_parameter(
            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)

    def forward(self, input):
        """Apply the weight-normalized linear map, bias and activation.

        Args:
            input (Variable|list of Variable): The input tensor(s).

        Returns:
            The result of the activation appended to the (optionally biased)
            sum of the per-input products.
        """
        mul_results = list()
        for i, (inp, _) in enumerate(
                self._helper.iter_inputs_and_params(input, self._param_attr)):
            # Normalize v along axis 0.
            v_norm = self._helper.create_variable_for_type_inference(
                self._dtype)
            v_normalized = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="norm",
                inputs={"X": self.__v[i]},
                outputs={"Out": v_normalized,
                         "Norm": v_norm},
                attrs={"axis": 0,
                       "epsilon": self._epsilon})

            # weight = normalize(v) * g, with g aligned to axis 1.
            weight = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="elementwise_mul",
                inputs={"X": [v_normalized],
                        "Y": [self.__g[i]]},
                outputs={"Out": [weight]},
                attrs={"axis": 1})

            tmp = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="mul",
                inputs={"X": inp,
                        "Y": weight},
                outputs={"Out": tmp},
                attrs={
                    "x_num_col_dims": self._num_flatten_dims,
                    "y_num_col_dims": 1
                })
            mul_results.append(tmp)

        if len(mul_results) == 1:
            pre_bias = mul_results[0]
        else:
            pre_bias = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="sum",
                inputs={"X": mul_results},
                outputs={"Out": pre_bias},
                attrs={"use_mkldnn": False})

        # Test for presence explicitly instead of relying on the truth value
        # of a variable.
        if self._b is not None:
            pre_activation = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._b]},
                outputs={"Out": [pre_activation]},
                attrs={"axis": self._num_flatten_dims})
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(pre_activation, act=self._act)
 class Conv2D(dg.Layer):
    """
    Weight-normalized 2-D convolution layer.

    The layer computes its output from the input, the filter and the
    strides, paddings, dilations and groups parameters. Input and Output are
    in NCHW format, where N is batch size, C is the number of channels, H is
    the height of the feature, and W is the width of the feature.
    Filter is in MCHW format, where M is the number of output image channels,
    C is the number of input image channels, H is the height of the filter,
    and W is the width of the filter. If the groups is greater than 1,
    C will equal the number of input image channels divided by the groups.
    Please refer to UFLDL's `convolution
    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`
    for more details.
    If a bias parameter is created it is added to the output of the
    convolution, and the activation (if any) is applied to the final result.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \\sigma ((Vg) \\ast X + b)

    Where:

    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter direction value, a tensor with MCHW format.
    * :math:`g`: Filter magnitude value, a tensor with M format.
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

    Args:
        name_scope(str) : The name for this class.
        num_filters(int): The number of filter. It is as same as the output
            image channel.
        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
        stride (int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
        padding (int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups (int): The groups number of the Conv2d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. Default: groups=1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
            and the :math:`std` is :math:`(\\frac{2.0 }{filter\\_elem\\_num})^{0.5}`. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None
        epsilon (float): The epsilon attribute passed to the norm op that
            normalizes the filter direction. Default: 1e-30
        dtype (str): Dtype used for the parameters. Default: "float32"

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch.

    Examples:
        .. code-block:: python

          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from paddle.fluid.dygraph import Conv2D
          import numpy as np

          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
          with fluid.dygraph.guard():
              conv2d = Conv2D( "conv2d", 2, 3)
              data = to_variable( data )
              conv = conv2d( data )
    """

    def __init__(self,
                 name_scope,
                 num_filters,
                 filter_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 epsilon=1e-30,
                 dtype="float32"):
        assert param_attr is not False, "param_attr should not be False here."
        super(Conv2D, self).__init__(name_scope, dtype)

        # Geometry of the convolution, each normalized to a 2-element list.
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, "stride")
        self._padding = utils.convert_to_list(padding, 2, "padding")
        self._dilation = utils.convert_to_list(dilation, 2, "dilation")
        self._act = act

        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")
        self._use_cudnn = use_cudnn

        # Everything needed later by _build_once / forward.
        self._filter_size = filter_size
        self._num_filters = num_filters
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._epsilon = epsilon
        self._dtype = dtype

        # if (self._num_channels == self._groups and
        #         num_filters % self._num_channels == 0 and not self._use_cudnn):
        #     self._l_type = 'depthwise_conv2d'
        # else:
        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
        #  kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
        self._l_type = "conv2d"

    def _build_once(self, input):
        """Create the filter direction/magnitude and bias parameters.

        The number of input channels is read from ``input.shape[1]``.
        """
        in_channels = input.shape[1]
        self._num_channels = in_channels

        groups = self._groups
        if groups is None:
            channels_per_group = in_channels
        elif in_channels % groups != 0:
            raise ValueError("num_channels must be divisible by groups.")
        else:
            channels_per_group = in_channels // groups

        kernel = utils.convert_to_list(self._filter_size, 2, "filter_size")
        v_shape = [self._num_filters, int(channels_per_group)] + kernel

        # Standard deviation of the default Normal initializer.
        elem_count = kernel[0] * kernel[1] * self._num_channels
        default_std = (2.0 / elem_count)**0.5

        # weight_v (direction)
        self._filter_param_v = self.create_parameter(
            attr=self._param_attr,
            shape=v_shape,
            dtype=self._dtype,
            default_initializer=Normal(0.0, default_std, 0))

        # weight_g (magnitude), initialized with the per-filter norm of v
        g_init = _norm(
            self._filter_param_v.numpy(), dim=0)  # CAUTION: hard-code
        g_attr = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(g_init))
        self._filter_param_g = self.create_parameter(
            attr=g_attr,
            shape=g_init.shape,
            dtype=self._dtype,
            default_initializer=Normal(0.0, default_std, 0))

        if self._use_cudnn:
            for cache_name in ("kCUDNNFwdAlgoCache", "kCUDNNBwdDataAlgoCache",
                               "kCUDNNBwdFilterAlgoCache"):
                self.create_variable(
                    name=cache_name,
                    persistable=True,
                    type=core.VarDesc.VarType.RAW)

        self._bias_param = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        """Convolve ``input`` with the weight-normalized filter.

        Args:
            input (Variable): The input tensor.

        Returns:
            The result of the activation appended to the (optionally biased)
            convolution output.
        """
        helper = self._helper
        new_var = helper.create_variable_for_type_inference
        direction = self._filter_param_v

        # 1. Flatten v to 2-D so that each filter becomes one row.
        flat_v = new_var(self._dtype)
        flat_xshape = new_var(self._dtype)
        flat_shape = [
            direction.shape[0],
            reduce(lambda acc, extent: acc * extent, direction.shape[1:], 1),
        ]
        helper.append_op(
            type="reshape2",
            inputs={"X": direction},
            attrs={"shape": flat_shape},
            outputs={"Out": flat_v,
                     "XShape": flat_xshape})

        # 2. Normalize along axis 1 of the flattened matrix.
        row_norm = new_var(self._dtype)
        flat_unit = new_var(self._dtype)
        helper.append_op(
            type="norm",
            inputs={"X": flat_v},
            outputs={"Out": flat_unit,
                     "Norm": row_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        # 3. Restore the original filter shape.
        unit_v = new_var(self._dtype)
        unit_xshape = new_var(self._dtype)
        helper.append_op(
            type="reshape2",
            inputs={"X": flat_unit},
            attrs={"shape": direction.shape},
            outputs={"Out": unit_v,
                     "XShape": unit_xshape})

        # 4. Scale the normalized filters by g.
        scaled_filter = new_var(self._dtype)
        helper.append_op(
            type="elementwise_mul",
            inputs={"X": [unit_v],
                    "Y": [self._filter_param_g]},
            outputs={"Out": [scaled_filter]},
            attrs={"axis": 0},  # CAUTION: hard-code
        )

        # 5. The convolution itself.
        conv_out = new_var(dtype=self._dtype)
        conv_attrs = {
            "strides": self._stride,
            "paddings": self._padding,
            "dilations": self._dilation,
            "groups": self._groups or 1,
            "use_cudnn": self._use_cudnn,
            "use_mkldnn": False,
        }
        helper.append_op(
            type=self._l_type,
            inputs={"Input": input,
                    "Filter": scaled_filter},
            outputs={"Output": conv_out},
            attrs=conv_attrs)

        # 6. Optional bias along axis 1.
        if self._bias_param is None:
            biased = conv_out
        else:
            biased = new_var(dtype=self._dtype)
            helper.append_op(
                type="elementwise_add",
                inputs={"X": [conv_out],
                        "Y": [self._bias_param]},
                outputs={"Out": [biased]},
                attrs={"axis": 1})

        # Currently, we don't support inplace in dygraph mode
        return helper.append_activation(biased, act=self._act)
 class Conv2DTranspose(dg.Layer):
    """
    **Convlution2D transpose layer**
    The convolution2D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input(Input) and output(Output)
    are in NCHW format. Where N is batch size, C is the number of channels,
    H is the height of the feature, and W is the width of the feature.
    Parameters(dilations, strides, paddings) are two elements. These two elements
    represent height and width, respectively. The details of convolution transpose
    layer, please refer to the following explanation and references
    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
    If bias attribution and activation type are provided, bias is added to
    the output of the convolution, and the corresponding activation function
    is applied to the final result.
    For each input :math:`X`, the equation is:
    .. math::
        Out = \sigma ((Vg) \\ast X + b)
    Where:
    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter value, a tensor with MCHW format.
    * :math:`g`: Filter value, a tensor with M format.
    * :math:`\\ast`: Convolution operation.
    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
    * :math:`\\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
    Example:
        - Input:
          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
        - Output:
          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
        Where
        .. math::
           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
    Args:
        name_scope(str): The name of this class.
        num_filters(int): The number of the filter. It is as same as the output
            image channel.
        output_size(int|tuple|None): The output image size. If output size is a
            tuple, it must contain two integers, (image_H, image_W). None if use
            filter_size, padding, and stride to calculate output_size.
            if output_size and filter_size are specified at the same time, They
            should follow the formula above. Default: None.
        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square. None if use output size to
            calculate filter_size. Default: None.
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups(int): The groups number of the Conv2d transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
            first half of the input channels, while the second half of the
            filters is only connected to the second half of the input channels.
            Default: groups = 1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True.
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None.
    Returns:
        Variable: The tensor variable storing the convolution transpose result.
    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch.
    Examples:
       .. code-block:: python
          import paddle.fluid as fluid
          import numpy
          with fluid.dygraph.guard():
              data = numpy.random.random((3, 32, 32)).astype('float32')
              conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
                    'Conv2DTranspose', num_filters=2, filter_size=3)
              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
    """
    def __init__(self,
                 name_scope,
                 num_filters,
                 output_size=None,
                 filter_size=None,
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 epsilon=1e-30,
                 act=None,
                 dtype="float32"):
        """Store the layer configuration.

        No parameter is created here; padding, stride, dilation, filter_size
        and output_size are kept as given and normalized in _build_once.
        `epsilon` is the epsilon attribute of the norm op used in forward.
        """
        super(Conv2DTranspose, self).__init__(name_scope, dtype)
        assert (param_attr is not False
                ), "param_attr should not be False in conv2d_transpose."
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        # Keep the requested activation like FC and Conv2D do; it used to be
        # accepted and then silently dropped.
        # NOTE(review): confirm forward passes self._act to append_activation.
        self._act = act
        self._groups = groups
        self._num_filters = num_filters
        self._use_cudnn = use_cudnn
        self._padding = padding
        self._stride = stride
        self._dilation = dilation
        self._filter_size = filter_size
        self._output_size = output_size
        self._op_type = "conv2d_transpose"
        self._epsilon = epsilon
        # NOTE(review): self._dtype is read later but not assigned here;
        # presumably the base class sets it from `dtype` - confirm.
    def _build_once(self, input):
        """Normalize the configuration and create the layer parameters.

        Creates the filter direction (v), the filter magnitude (g, initialized
        with the norm of v over every dimension except the first) and the
        bias.

        Args:
            input (Variable): The input tensor; its channel count is read
                from ``shape[1]`` and, when filter_size is None, its spatial
                size from ``shape[2]`` and ``shape[3]``.

        Raises:
            TypeError: If input is not a Variable.
            ValueError: If use_cudnn is not a bool, if both filter_size and
                output_size are None, or if output_size is neither None, an
                int, a list nor a tuple.
        """
        # Validate the type before touching `input.shape`, so a wrong input
        # fails with the intended message.
        if not isinstance(input, Variable):
            raise TypeError("Input of conv2d_transpose must be Variable")

        input_channel = input.shape[1]
        if (input_channel == self._groups and
                self._num_filters == input_channel and not self._use_cudnn):
            self._op_type = "depthwise_conv2d_transpose"

        self._padding = utils.convert_to_list(self._padding, 2, "padding")
        self._stride = utils.convert_to_list(self._stride, 2, "stride")
        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")

        if not isinstance(self._use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")

        if self._filter_size is None:
            # Infer the filter size from the requested output size.
            if self._output_size is None:
                raise ValueError(
                    "output_size must be set when filter_size is None")
            if isinstance(self._output_size, int):
                self._output_size = [self._output_size, self._output_size]

            h_in = input.shape[2]
            w_in = input.shape[3]

            filter_size_h = (self._output_size[0] -
                             (h_in - 1) * self._stride[0] + 2 * self._padding[0]
                             - 1) // self._dilation[0] + 1
            filter_size_w = (self._output_size[1] -
                             (w_in - 1) * self._stride[1] + 2 * self._padding[1]
                             - 1) // self._dilation[1] + 1
            self._filter_size = [filter_size_h, filter_size_w]
        else:
            self._filter_size = utils.convert_to_list(
                self._filter_size, 2, "conv2d_transpose.filter_size")

        if self._output_size is None:
            self._output_size = []
        elif isinstance(self._output_size, (list, tuple, int)):
            # Tuples are accepted as well, as the class documentation states.
            self._output_size = utils.convert_to_list(self._output_size, 2,
                                                      "output_size")
        else:
            raise ValueError("output_size should be list or int")

        self._groups = 1 if self._groups is None else self._groups
        filter_shape = [
            input_channel,
            self._num_filters // self._groups,
        ] + self._filter_size

        # img filter v (direction)
        self._img_filter_v = self.create_parameter(
            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)

        # img filter g (magnitude)
        img_filter_magnitude = _norm(
            self._img_filter_v.numpy(), dim=0)  # CAUTION: hard-code
        self._img_filter_g = self.create_parameter(
            dtype=input.dtype,
            shape=img_filter_magnitude.shape,
            attr=fluid.ParamAttr(
                initializer=NumpyArrayInitializer(img_filter_magnitude)))

        self._img_bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)
    def forward(self, input):
        """Run the layer on ``input`` using a filter rebuilt from ``v`` and ``g``.

        On every call the filter is recomputed from the two parameters held
        by the layer: ``_img_filter_v`` is reshaped to 2-D, passed through
        the ``norm`` op along axis 1, reshaped back, and multiplied by
        ``_img_filter_g``. The resulting filter is fed to the op named by
        ``self._op_type``; the bias (when present) is added along axis 1 and
        the helper's activation is applied last.

        Args:
            input: variable passed to the ``Input`` slot of the op; its
                ``dtype`` is used for the op's output variable.

        Returns:
            Whatever ``self._helper.append_activation`` returns for the
            (optionally biased) op output.
        """
        helper = self._helper
        dtype = self._dtype
        direction = self._img_filter_v

        # Step 1: collapse all trailing dimensions of the direction filter so
        # it becomes a 2-D matrix with one row per leading-axis entry.
        flat_filter = helper.create_variable_for_type_inference(dtype)
        flat_xshape = helper.create_variable_for_type_inference(dtype)
        trailing_size = 1
        for extent in direction.shape[1:]:
            trailing_size = trailing_size * extent
        helper.append_op(
            type="reshape2",
            inputs={"X": direction},
            attrs={"shape": [direction.shape[0], trailing_size]},
            outputs={"Out": flat_filter,
                     "XShape": flat_xshape})

        # Step 2: normalize each row of the flattened matrix (axis 1).
        row_norm = helper.create_variable_for_type_inference(dtype)
        flat_unit = helper.create_variable_for_type_inference(dtype)
        helper.append_op(
            type="norm",
            inputs={"X": flat_filter},
            outputs={"Out": flat_unit,
                     "Norm": row_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        # Step 3: restore the original filter shape.
        unit_filter = helper.create_variable_for_type_inference(dtype)
        unit_xshape = helper.create_variable_for_type_inference(dtype)
        helper.append_op(
            type="reshape2",
            inputs={"X": flat_unit},
            attrs={"shape": direction.shape},
            outputs={"Out": unit_filter,
                     "XShape": unit_xshape})

        # Step 4: scale the normalized direction by the magnitude parameter.
        # CAUTION: the broadcast axis (0) is hard-coded.
        scaled_filter = helper.create_variable_for_type_inference(dtype)
        helper.append_op(
            type="elementwise_mul",
            inputs={"X": [unit_filter],
                    "Y": [self._img_filter_g]},
            outputs={"Out": [scaled_filter]},
            attrs={"axis": 0})

        # Step 5: run the underlying op with the rebuilt filter.
        op_attrs = {
            "output_size": self._output_size,
            "strides": self._stride,
            "paddings": self._padding,
            "dilations": self._dilation,
            "groups": self._groups,
            "use_cudnn": self._use_cudnn,
        }
        pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
        helper.append_op(
            type=self._op_type,
            inputs={"Input": [input],
                    "Filter": [scaled_filter]},
            outputs={"Output": pre_bias},
            attrs=op_attrs)

        # Step 6: add the bias along axis 1 only when the layer has one.
        if self._img_bias is None:
            pre_act = pre_bias
        else:
            pre_act = helper.create_variable_for_type_inference(dtype=dtype)
            helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._img_bias]},
                outputs={"Out": [pre_act]},
                attrs={"axis": 1})

        return helper.append_activation(pre_act)