diff --git a/parakeet/data/sampler.py b/parakeet/data/sampler.py
index 097cc03..60aa5db 100644
--- a/parakeet/data/sampler.py
+++ b/parakeet/data/sampler.py
@@ -163,6 +163,35 @@ class WeightedRandomSampler(Sampler):
         return self.num_samples
 
 
+class DistributedSampler(Sampler):
+    def __init__(self, dataset_size, num_trainers, rank, shuffle=True):
+        self.dataset_size = dataset_size
+        self.num_trainers = num_trainers
+        self.rank = rank
+        self.num_samples = int(np.ceil(dataset_size / num_trainers))
+        self.total_size = self.num_samples * num_trainers
+        assert self.total_size >= self.dataset_size
+        self.shuffle = shuffle
+
+    def __iter__(self):
+        indices = list(range(self.dataset_size))
+        if self.shuffle:
+            random.shuffle(indices)
+
+        # Append extra samples to make it evenly distributed on all trainers.
+        indices += indices[:(self.total_size - self.dataset_size)]
+        assert len(indices) == self.total_size
+
+        # Subset samples for each trainer.
+        indices = indices[self.rank:self.total_size:self.num_trainers]
+        assert len(indices) ==  self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+
 class BatchSampler(Sampler):
     r"""Wraps another sampler to yield a mini-batch of indices.
     Args:
@@ -206,4 +235,4 @@ class BatchSampler(Sampler):
         if self.drop_last:
             return len(self.sampler) // self.batch_size
         else:
-            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
\ No newline at end of file
+            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
diff --git a/parakeet/models/wavenet/configs/wavenet_ljspeech_mix_gaussian.yaml b/parakeet/models/wavenet/configs/wavenet_ljspeech_mix_gaussian.yaml
new file mode 100644
index 0000000..bf19577
--- /dev/null
+++ b/parakeet/models/wavenet/configs/wavenet_ljspeech_mix_gaussian.yaml
@@ -0,0 +1,32 @@
+valid_size: 16
+train_clip_second: 0.5
+sample_rate: 22050
+fft_window_shift: 256
+fft_window_size: 1024
+fft_size: 2048
+mel_bands: 80
+
+seed: 1
+batch_size: 8
+test_every: 2000
+save_every: 10000
+max_iterations: 2000000
+
+layers: 30
+kernel_width: 2
+dilation_block: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
+residual_channels: 128
+skip_channels: 128
+loss_type: mix-gaussian-pdf
+num_mixtures: 10
+log_scale_min: -9.0
+
+conditioner: 
+  filter_sizes: [[32, 3], [32, 3]]
+  upsample_factors: [16, 16]
+
+learning_rate: 0.001
+gradient_max_norm: 100.0
+anneal: 
+  every: 200000
+  rate: 0.5
diff --git a/parakeet/models/wavenet/configs/wavenet_ljspeech_softmax.yaml b/parakeet/models/wavenet/configs/wavenet_ljspeech_softmax.yaml
new file mode 100644
index 0000000..f39de5d
--- /dev/null
+++ b/parakeet/models/wavenet/configs/wavenet_ljspeech_softmax.yaml
@@ -0,0 +1,31 @@
+valid_size: 16
+train_clip_second: 0.5
+sample_rate: 22050
+fft_window_shift: 256
+fft_window_size: 1024
+fft_size: 2048
+mel_bands: 80
+
+seed: 1
+batch_size: 8
+test_every: 2000
+save_every: 10000
+max_iterations: 2000000
+
+layers: 30
+kernel_width: 2
+dilation_block: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
+residual_channels: 128
+skip_channels: 128
+loss_type: softmax
+num_channels: 2048
+
+conditioner: 
+  filter_sizes: [[32, 3], [32, 3]]
+  upsample_factors: [16, 16]
+
+learning_rate: 0.001
+gradient_max_norm: 100.0
+anneal: 
+  every: 200000
+  rate: 0.5
diff --git a/parakeet/models/wavenet/data.py b/parakeet/models/wavenet/data.py
index 61cc4ab..a4f1b70 100644
--- a/parakeet/models/wavenet/data.py
+++ b/parakeet/models/wavenet/data.py
@@ -1,5 +1,3 @@
-import math
-import os
 import random
 
 import librosa
@@ -9,7 +7,7 @@ from paddle import fluid
 import utils
 from parakeet.datasets import ljspeech
 from parakeet.data import dataset
-from parakeet.data.sampler import Sampler, BatchSampler, SequentialSampler
+from parakeet.data.sampler import DistributedSampler, BatchSampler
 from parakeet.data.datacargo import DataCargo
 
 
@@ -20,7 +18,7 @@ class Dataset(ljspeech.LJSpeech):
         self.fft_window_shift = config.fft_window_shift
         # Calculate context frames.
         frames_per_second = config.sample_rate // self.fft_window_shift
-        train_clip_frames = int(math.ceil(
+        train_clip_frames = int(np.ceil(
             config.train_clip_second * frames_per_second))
         context_frames = config.context_size // self.fft_window_shift
         self.num_frames = train_clip_frames + context_frames
@@ -39,7 +37,7 @@ class Dataset(ljspeech.LJSpeech):
         assert loaded_sr == sr
 
         # Pad audio to the right size.
-        frames = math.ceil(float(audio.size) / fft_window_shift)
+        frames = int(np.ceil(float(audio.size) / fft_window_shift))
         fft_padding = (fft_size - fft_window_shift) // 2
         desired_length = frames * fft_window_shift + fft_padding * 2
         pad_amount = (desired_length - audio.size) // 2
@@ -125,35 +123,6 @@ class Subset(dataset.Dataset):
         return len(self.indices)
 
 
-class DistributedSampler(Sampler):
-    def __init__(self, dataset_size, num_trainers, rank, shuffle=True):
-        self.dataset_size = dataset_size
-        self.num_trainers = num_trainers
-        self.rank = rank
-        self.num_samples = int(math.ceil(dataset_size / num_trainers))
-        self.total_size = self.num_samples * num_trainers
-        assert self.total_size >= self.dataset_size
-        self.shuffle = shuffle
-    
-    def __iter__(self):
-        indices = list(range(self.dataset_size))
-        if self.shuffle:
-            random.shuffle(indices)
-        
-        # Append extra samples to make it evenly distributed on all trainers.
-        indices += indices[:(self.total_size - self.dataset_size)]
-        assert len(indices) == self.total_size
-
-        # Subset samples for each trainer.
-        indices = indices[self.rank:self.total_size:self.num_trainers]
-        assert len(indices) ==  self.num_samples
-
-        return iter(indices)
-
-    def __len__(self):
-        return self.num_samples
-
-
 class LJSpeech:
     def __init__(self, config, nranks, rank):
         place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
diff --git a/parakeet/models/wavenet/ops.py b/parakeet/models/wavenet/ops.py
deleted file mode 100644
index 6eda2a9..0000000
--- a/parakeet/models/wavenet/ops.py
+++ /dev/null
@@ -1,249 +0,0 @@
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-import numpy as np
-
-import weight_norm
-
-
-def Embedding(name_scope,
-              num_embeddings,
-              embed_dim,
-              padding_idx=None,
-              std=0.1,
-              dtype="float32"):
-    # param attrs
-    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
-        scale=std))
-    layer = dg.Embedding(
-        name_scope, (num_embeddings, embed_dim),
-        padding_idx=padding_idx,
-        param_attr=weight_attr,
-        dtype=dtype)
-    return layer
-
-
-def FC(name_scope,
-       in_features,
-       size,
-       num_flatten_dims=1,
-       relu=False,
-       dropout=0.0,
-       act=None,
-       dtype="float32"):
-    """
-    A special Linear Layer, when it is used with dropout, the weight is 
-    initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
-    """
-
-    # stds
-    if isinstance(in_features, int):
-        in_features = [in_features]
-
-    stds = [np.sqrt((1.0 - dropout) / in_feature) for in_feature in in_features]
-    if relu:
-        stds = [std * np.sqrt(2.0) for std in stds]
-
-    weight_inits = [
-        fluid.initializer.NormalInitializer(scale=std) for std in stds
-    ]
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-
-    # param attrs
-    weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-
-    layer = weight_norm.FC(name_scope,
-                           size,
-                           num_flatten_dims=num_flatten_dims,
-                           param_attr=weight_attrs,
-                           bias_attr=bias_attr,
-                           act=act,
-                           dtype=dtype)
-    return layer
-
-
-def Conv1D(name_scope,
-           in_channels,
-           num_filters,
-           filter_size=2,
-           dilation=1,
-           groups=None,
-           causal=False,
-           std_mul=1.0,
-           dropout=0.0,
-           use_cudnn=True,
-           act=None,
-           dtype="float32"):
-    """
-    A special Conv1D Layer, when it is used with dropout, the weight is 
-    initialized as 
-    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_channels)))
-    """
-    # std
-    std = np.sqrt((std_mul * (1.0 - dropout)) / (filter_size * in_channels))
-    weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-
-    # param attrs
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-
-    layer = weight_norm.Conv1D(
-        name_scope,
-        num_filters,
-        filter_size,
-        dilation,
-        groups=groups,
-        causal=causal,
-        param_attr=weight_attr,
-        bias_attr=bias_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-    return layer
-
-
-class Conv1D_GU(dg.Layer):
-    def __init__(self,
-                 name_scope,
-                 conditioner_dim,
-                 in_channels,
-                 num_filters,
-                 filter_size,
-                 dilation,
-                 causal=False,
-                 residual=True,
-                 dtype="float32"):
-        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)
-
-        self.conditioner_dim = conditioner_dim
-        self.in_channels = in_channels
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.causal = causal
-        self.residual = residual
-
-        if residual:
-            assert (
-                in_channels == num_filters
-            ), "this block uses residual connection"\
-                "the input_channels should equals num_filters"
-
-        self.conv = Conv1D(
-            self.full_name(),
-            in_channels,
-            2 * num_filters,
-            filter_size,
-            dilation,
-            causal=causal,
-            dtype=dtype)
-
-        self.fc = Conv1D(
-            self.full_name(),
-            conditioner_dim,
-            2 * num_filters,
-            filter_size=1,
-            dilation=1,
-            causal=False,
-            dtype=dtype)
-
-    def forward(self, x, skip=None, conditioner=None):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
-                layer, where B means batch_size, C_in means the input channels
-                T means input time steps.
-            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
-                conditioner, where C_con is conditioner hidden dim which
-                equals the num of mel bands. Note that when using residual
-                connection, the Conv1DGLU does not change the number of
-                channels, so out channels equals input channels.
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
-                C_out means the output channels of Conv1DGLU.
-        """
-        residual = x
-        x = self.conv(x)
-
-        if conditioner is not None:
-            cond_bias = self.fc(conditioner)
-            x += cond_bias
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        # Gated Unit.
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
-                                         fluid.layers.tanh(content))
-
-        if skip is None:
-            skip = x
-        else:
-            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
-
-        if self.residual:
-            x = fluid.layers.scale(residual + x, np.sqrt(0.5))
-
-        return x, skip
-
-    def add_input(self, x, skip=None, conditioner=None):
-        """
-        Inputs:
-        x: shape(B, num_filters, 1, time_steps)
-        conditioner: shape(B, conditioner_dim, 1, time_steps)
-        Outputs:
-        out: shape(B, num_filters, 1, time_steps), where time_steps = 1
-        """
-        residual = x
-
-        # add step input and produce step output
-        x = self.conv.add_input(x)
-
-        if conditioner is not None:
-            cond_bias = self.fc(conditioner)
-            x += cond_bias
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        # Gated Unit.
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
-                                         fluid.layers.tanh(content))
-
-        if skip is None:
-            skip = x
-        else:
-            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
-
-        if self.residual:
-            x = fluid.layers.scale(residual + x, np.sqrt(0.5))
-
-        return x, skip
-
-
-def Conv2DTranspose(name_scope,
-                    num_filters,
-                    filter_size,
-                    padding=0,
-                    stride=1,
-                    dilation=1,
-                    use_cudnn=True,
-                    act=None,
-                    dtype="float32"):
-    val = 1.0 / (filter_size[0] * filter_size[1])
-    weight_init = fluid.initializer.ConstantInitializer(val)
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-
-    layer = weight_norm.Conv2DTranspose(
-        name_scope,
-        num_filters,
-        filter_size=filter_size,
-        padding=padding,
-        stride=stride,
-        dilation=dilation,
-        param_attr=weight_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-
-    return layer
diff --git a/parakeet/models/wavenet/wavenet.py b/parakeet/models/wavenet/wavenet.py
index acc6e76..c636c4b 100644
--- a/parakeet/models/wavenet/wavenet.py
+++ b/parakeet/models/wavenet/wavenet.py
@@ -4,12 +4,12 @@ import time
 
 import librosa
 import numpy as np
-from paddle import fluid
 import paddle.fluid.dygraph as dg
+from paddle import fluid
 
 import utils
 from data import LJSpeech
-from wavenet_modules import WaveNetModule, debug
+from wavenet_modules import WaveNetModule
 
 
 class WaveNet():
@@ -33,18 +33,6 @@ class WaveNet():
         self.trainloader = dataset.trainloader
         self.validloader = dataset.validloader
 
-#        if self.rank == 0:
-#            for i, (audios, mels, ids) in enumerate(self.validloader()):
-#                print("audios {}, mels {}, ids {}".format(audios.dtype, mels.dtype, ids.dtype))
-#                print("{}: rank {}, audios {}, mels {}, indices {} / {}".format(
-#                    i, self.rank, audios.shape, mels.shape, ids.shape,
-#                    ids.numpy()))
-#    
-#            for i, (audios, mels, ids) in enumerate(self.trainloader):
-#                print("{}: rank {}, audios {}, mels {}, indices {} / {}".format(
-#                    i, self.rank, audios.shape, mels.shape, ids.shape,
-#                    ids.numpy()))
-
         wavenet = WaveNetModule("wavenet", config, self.rank)
         
         # Dry run once to create and initalize all necessary parameters.
@@ -139,8 +127,8 @@ class WaveNet():
         self.wavenet.eval()
 
         total_loss = []
-        start_time = time.time()
         sample_audios = []
+        start_time = time.time()
         for audios, mels, audio_starts in self.validloader():
             loss, sample_audio = self.wavenet(audios, mels, audio_starts, True)
             total_loss.append(float(loss.numpy()))
@@ -160,11 +148,6 @@ class WaveNet():
             tb.add_audio("Teacher-Forced-Audio-1", sample_audios[1].numpy(),
                 iteration, sample_rate=self.config.sample_rate)
 
-    def save(self, iteration):
-        utils.save_latest_parameters(self.checkpoint_dir, iteration,
-                                     self.wavenet, self.optimizer)
-        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
-
     @dg.no_grad
     def infer(self, iteration):
         self.wavenet.eval()
@@ -186,3 +169,8 @@ class WaveNet():
             syn_audio.shape, syn_time))
         librosa.output.write_wav(filename, syn_audio,
             sr=config.sample_rate)
+
+    def save(self, iteration):
+        utils.save_latest_parameters(self.checkpoint_dir, iteration,
+                                     self.wavenet, self.optimizer)
+        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
diff --git a/parakeet/models/wavenet/wavenet_modules.py b/parakeet/models/wavenet/wavenet_modules.py
index c5c01e9..fbab741 100644
--- a/parakeet/models/wavenet/wavenet_modules.py
+++ b/parakeet/models/wavenet/wavenet_modules.py
@@ -1,11 +1,9 @@
 import itertools
-import math
 
 import numpy as np
-from paddle import fluid
 import paddle.fluid.dygraph as dg
-import ops
-import weight_norm
+from paddle import fluid
+from parakeet.modules import conv, modules
 
 
 def get_padding(filter_size, stride, padding_type='same'):
@@ -16,22 +14,6 @@ def get_padding(filter_size, stride, padding_type='same'):
     return padding
 
 
-def debug(x, var_name, rank, verbose=False):
-    if not verbose and rank != 0:
-        return
-    dim = len(x.shape)
-    if not isinstance(x, np.ndarray):
-        x = x.numpy()
-    if dim == 1:
-        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x))
-    elif dim == 2:
-        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5]))
-    elif dim == 3:
-        print("Rank {}".format(rank), var_name, "shape {}, value {}".format(x.shape, x[:, :5, 0]))
-    else:
-        print("Rank", rank, var_name, "shape", x.shape)
-
-
 def extract_slices(x, audio_starts, audio_length, rank):
     slices = []
     for i in range(x.shape[0]): 
@@ -58,7 +40,7 @@ class Conditioner(dg.Layer):
             stride = (up_scale, 1)
             padding = get_padding(filter_sizes[i], stride)
             self.deconvs.append(
-                ops.Conv2DTranspose(
+                modules.Conv2DTranspose(
                     self.full_name(),
                     num_filters=1,
                     filter_size=filter_sizes[i],
@@ -94,12 +76,13 @@ class WaveNetModule(dg.Layer):
         print("context_size", self.context_size)
 
         if config.loss_type == "softmax":
-            self.embedding_fc = ops.Embedding(
+            self.embedding_fc = modules.Embedding(
                 self.full_name(),
                 num_embeddings=config.num_channels,
-                embed_dim=config.residual_channels)
+                embed_dim=config.residual_channels,
+                std=0.1)
         elif config.loss_type == "mix-gaussian-pdf":
-            self.embedding_fc = ops.FC(
+            self.embedding_fc = modules.FC(
                 self.full_name(),
                 in_features=1,
                 size=config.residual_channels,
@@ -112,7 +95,7 @@ class WaveNetModule(dg.Layer):
         self.dilated_causal_convs = []
         for dilation in self.dilations:
             self.dilated_causal_convs.append(
-                ops.Conv1D_GU(
+                modules.Conv1D_GU(
                     self.full_name(),
                     conditioner_dim=config.mel_bands,
                     in_channels=config.residual_channels,
@@ -126,7 +109,7 @@ class WaveNetModule(dg.Layer):
         for i, layer in enumerate(self.dilated_causal_convs):
             self.add_sublayer("dilated_causal_conv_{}".format(i), layer) 
 
-        self.fc1 = ops.FC(
+        self.fc1 = modules.FC(
             self.full_name(),
             in_features=config.residual_channels,
             size=config.skip_channels,
@@ -134,7 +117,7 @@ class WaveNetModule(dg.Layer):
             relu=True,
             act="relu")
 
-        self.fc2 = ops.FC(
+        self.fc2 = modules.FC(
             self.full_name(),
             in_features=config.skip_channels,
             size=config.skip_channels,
@@ -143,14 +126,14 @@ class WaveNetModule(dg.Layer):
             act="relu")
 
         if config.loss_type == "softmax":
-            self.fc3 = ops.FC(
+            self.fc3 = modules.FC(
                 self.full_name(),
                 in_features=config.skip_channels,
                 size=config.num_channels,
                 num_flatten_dims=2,
                 relu=False)
         elif config.loss_type == "mix-gaussian-pdf":
-            self.fc3 = ops.FC(
+            self.fc3 = modules.FC(
                 self.full_name(),
                 in_features=config.skip_channels,
                 size=3 * config.num_mixtures,
@@ -175,8 +158,8 @@ class WaveNetModule(dg.Layer):
         return samples
 
     def sample_mix_gaussian(self, mix_parameters):
-        # mix_parameters reshape from [bs, 13799, 3 * num_mixtures]
-        # to [bs * 13799, 3 * num_mixtures].
+        # mix_parameters reshape from [bs, len, 3 * num_mixtures]
+        # to [bs * len, 3 * num_mixtures].
         batch, length, hidden = mix_parameters.shape
         mix_param_2d = fluid.layers.reshape(mix_parameters,
             [batch * length, hidden])
@@ -197,7 +180,7 @@ class WaveNetModule(dg.Layer):
         mu_comp = fluid.layers.gather_nd(mu, comp_samples)
         s_comp = fluid.layers.gather_nd(s, comp_samples) 
 
-        # N(0, 1) Normal Sample.
+        # N(0, 1) normal sample.
         u = fluid.layers.gaussian_random(shape=[batch * length])
         samples = mu_comp + u * s_comp
         samples = fluid.layers.clip(samples, min=-1.0, max=1.0)
@@ -205,8 +188,6 @@ class WaveNetModule(dg.Layer):
         return samples
 
     def softmax_loss(self, targets, mix_parameters):
-        # targets: [bs, 13799] -> [bs, 11752]
-        # mix_params: [bs, 13799, 3] -> [bs, 11752, 3]
         targets = targets[:, self.context_size:]
         mix_parameters = mix_parameters[:, self.context_size:, :]
 
@@ -216,22 +197,22 @@ class WaveNetModule(dg.Layer):
         quantized = fluid.layers.cast(
             (targets + 1.0) / 2.0 * num_channels, dtype="int64")
 
-        # per_sample_loss shape: [bs, 17952, 1]
+        # per_sample_loss shape: [bs, len, 1]
         per_sample_loss = fluid.layers.softmax_with_cross_entropy(
             logits=mix_parameters, label=fluid.layers.unsqueeze(quantized, 2))
         loss = fluid.layers.reduce_mean(per_sample_loss)
-        #debug(loss, "softmax loss", self.rank)
 
         return loss
 
     def mixture_density_loss(self, targets, mix_parameters, log_scale_min):
-        # targets: [bs, 13799] -> [bs, 11752]
-        # mix_params: [bs, 13799, 3] -> [bs, 11752, 3]
+        # targets: [bs, len]
+        # mix_params: [bs, len, 3 * num_mixture]
         targets = targets[:, self.context_size:]
         mix_parameters = mix_parameters[:, self.context_size:, :]
 
-        # log_s: [bs, 11752, num_mixture]
-        logits_pi, mu, log_s = fluid.layers.split(mix_parameters, num_or_sections=3, dim=-1)
+        # log_s: [bs, len, num_mixture]
+        logits_pi, mu, log_s = fluid.layers.split(
+            mix_parameters, num_or_sections=3, dim=-1)
 
         pi = fluid.layers.softmax(logits_pi, axis=-1)
         log_s = fluid.layers.clip(log_s, min=log_scale_min, max=100.0)
@@ -242,10 +223,9 @@ class WaveNetModule(dg.Layer):
         targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
         x_std =  inv_s * (targets - mu)
         exponent = fluid.layers.exp(-0.5 * x_std * x_std)
-        # pdf_x: [bs, 11752, 1]
         pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
         pdf_x = pi * pdf_x
-        # pdf_x: [bs, 11752]
+        # pdf_x: [bs, len]
         pdf_x = fluid.layers.reduce_sum(pdf_x, dim=-1)
         per_sample_loss = 0.0 - fluid.layers.log(pdf_x + 1e-9)
 
@@ -254,8 +234,6 @@ class WaveNetModule(dg.Layer):
         return loss
 
     def forward(self, audios, mels, audio_starts, sample=False):
-        # audios: [bs, 13800], mels: [bs, full_frame_length, 80]
-        # audio_starts: [bs]
         # Build conditioner based on mels.
         full_conditioner = self.conditioner(mels)
 
@@ -264,15 +242,14 @@ class WaveNetModule(dg.Layer):
         conditioner = extract_slices(full_conditioner,
             audio_starts, audio_length, self.rank)
     
-        # input_audio, target_audio: [bs, 13799]
+        # input_audio, target_audio: [bs, len]
         input_audios = audios[:, :-1]
         target_audios = audios[:, 1:]
-        # conditioner: [bs, 13799, 80]
+        # conditioner: [bs, len, mel_bands]
         conditioner = conditioner[:, 1:, :]
 
         loss_type = self.config.loss_type
 
-        # layer_input: [bs, 13799, 128]
         if loss_type == "softmax":
             input_audios = fluid.layers.clip(
                 input_audios, min=-1.0, max=0.99999)
@@ -280,31 +257,31 @@ class WaveNetModule(dg.Layer):
             quantized = fluid.layers.cast(
                 (input_audios + 1.0) / 2.0 * self.config.num_channels,
                 dtype="int64")
-            layer_input = self.embedding_fc(fluid.layers.unsqueeze(quantized, 2))
+            layer_input = self.embedding_fc(
+                fluid.layers.unsqueeze(quantized, 2))
         elif loss_type == "mix-gaussian-pdf":
-            layer_input = self.embedding_fc(fluid.layers.unsqueeze(input_audios, 2))
+            layer_input = self.embedding_fc(
+                fluid.layers.unsqueeze(input_audios, 2))
         else:
             raise ValueError(
                 "loss_type {} is unsupported!".format(loss_type))
 
-        # layer_input: [bs, res_channel, 1, 13799]
-        layer_input = fluid.layers.unsqueeze(fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
-        # conditioner: [bs, mel_bands, 1, 13799]
-        conditioner = fluid.layers.unsqueeze(fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
+        # layer_input: [bs, res_channel, 1, len]
+        layer_input = fluid.layers.unsqueeze(
+            fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
+        # conditioner: [bs, mel_bands, 1, len]
+        conditioner = fluid.layers.unsqueeze(
+            fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
 
-        # layer_input: [bs, res_channel, 1, 13799]
-        # skip: [bs, res_channel, 1, 13799]
         skip = None
         for i, layer in enumerate(self.dilated_causal_convs):
+            # layer_input: [bs, res_channel, 1, len]
+            # skip: [bs, res_channel, 1, len]
             layer_input, skip = layer(layer_input, skip, conditioner)
-            #debug(layer_input, "layer_input_" + str(i), self.rank)
-            #debug(skip, "skip_" + str(i), self.rank)
 
-        # Reshape skip to [bs, 13799, res_channel]
-        skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
-        #debug(skip, "skip", self.rank)
-
-        # mix_param: [bs, 13799, 3 * num_mixtures]
+        # Reshape skip to [bs, len, res_channel]
+        skip = fluid.layers.transpose(
+            fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
         mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
 
         # Sample teacher-forced audio.
@@ -317,12 +294,7 @@ class WaveNetModule(dg.Layer):
             else:
                 raise ValueError(
                     "loss_type {} is unsupported!".format(loss_type))
-            #debug(sample_audios, "sample_audios", self.rank)
 
-        # Calculate mix-gaussian density loss.
-        # padding is all zero.
-        # target_audio: [bs, 13799].
-        # mix_params: [bs, 13799, 3].
         if loss_type == "softmax":
             loss = self.softmax_loss(target_audios, mix_parameters)
         elif loss_type == "mix-gaussian-pdf":
@@ -332,27 +304,16 @@ class WaveNetModule(dg.Layer):
             raise ValueError(
                 "loss_type {} is unsupported!".format(loss_type))
 
-        #print("Rank {}, loss {}".format(self.rank, loss.numpy()))
- 
         return loss, sample_audios
 
     def synthesize(self, mels):
         self.start_new_sequence()
-        print("input mels shape", mels.shape)
-        # mels: [bs=1, n_frames, 80]
-        # conditioner: [1, n_frames * samples_per_frame, 80]
-        # Should I move forward by one sample? No difference
-        # Append context frame to mels
         bs, n_frames, mel_bands = mels.shape 
-        #num_pad_frames = int(np.ceil(self.context_size / self.config.fft_window_shift))
-        #silence = fluid.layers.zeros(shape=[bs, num_pad_frames, mel_bands], dtype="float32")
-        #inf_mels = fluid.layers.concat([silence, mels], axis=1)
-        #print("padded mels shape", inf_mels.shape)
-
-        #conditioner = self.conditioner(inf_mels)[:, self.context_size:, :]
         conditioner = self.conditioner(mels)
         time_steps = conditioner.shape[1]
-        print("Total steps", time_steps)
+
+        print("input mels shape", mels.shape)
+        print("Total synthesis steps", time_steps)
 
         loss_type = self.config.loss_type
         audio_samples = []
@@ -361,8 +322,8 @@ class WaveNetModule(dg.Layer):
             if i % 100 == 0:
                 print("Step", i)
 
-            # convert from real value sample to audio embedding.
-            # [bs, 1, 128]
+            # Convert from real value sample to audio embedding.
+            # audio_input: [bs, 1, channel]
             if loss_type == "softmax":
                 current_sample = fluid.layers.clip(
                     current_sample, min=-1.0, max=0.99999)
@@ -377,21 +338,23 @@ class WaveNetModule(dg.Layer):
                 raise ValueError(
                     "loss_type {} is unsupported!".format(loss_type))
 
-            # [bs, 128, 1, 1]
-            audio_input = fluid.layers.unsqueeze(fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
-            # [bs, 80]
+            # [bs, channel, 1, 1]
+            audio_input = fluid.layers.unsqueeze(
+                fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
+            # [bs, mel_bands]
             cond_input = conditioner[:, i, :]
-            # [bs, 80, 1, 1]
+            # [bs, mel_bands, 1, 1]
             cond_input = fluid.layers.reshape(
                 cond_input, cond_input.shape + [1, 1])
 
             skip = None
             for layer in self.dilated_causal_convs:
-                audio_input, skip = layer.add_input(audio_input, skip, cond_input)
+                audio_input, skip = layer.add_input(
+                    audio_input, skip, cond_input)
             
-            # [bs, 1, 128]
-            skip = fluid.layers.transpose(fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
-            # [bs, 1, 3]
+            # [bs, 1, channel]
+            skip = fluid.layers.transpose(
+                fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
             mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
             if loss_type == "softmax":
                 sample = self.sample_softmax(mix_parameters)
@@ -407,17 +370,12 @@ class WaveNetModule(dg.Layer):
             current_sample = fluid.layers.reshape(current_sample,
                 current_sample.shape + [1, 1])
 
-        # syn_audio: (num_samples,)
+        # syn_audio: [num_samples]
         syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
 
         return syn_audio        
 
     def start_new_sequence(self):
         for layer in self.sublayers():
-            if isinstance(layer, weight_norm.Conv1D):
+            if isinstance(layer, conv.Conv1D):
                 layer.start_new_sequence()
-
-    def save(self, iteration):
-        utils.save_latest_parameters(self.checkpoint_dir, iteration,
-                                     self.wavenet, self.optimizer)
-        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
diff --git a/parakeet/models/wavenet/weight_norm.py b/parakeet/models/wavenet/weight_norm.py
deleted file mode 100644
index 75fe413..0000000
--- a/parakeet/models/wavenet/weight_norm.py
+++ /dev/null
@@ -1,920 +0,0 @@
-import math
-from copy import deepcopy
-
-import numpy as np
-import paddle.fluid.dygraph as dg
-from paddle import fluid
-from paddle.fluid import core
-from paddle.fluid.framework import Variable
-from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
-from paddle.fluid.layers import utils
-from six.moves import reduce
-
-
-def _norm(p, dim):
-    """Computes the norm over all dimensions except dim.
-    It differs from pytorch implementation that it does not keep dim.
-    This difference is related with the broadcast mechanism in paddle.
-    Read elementeise_mul for more.
-    """
-    if dim is None:
-        return np.linalg.norm(p, ord=2, axis=None)
-    elif dim == 0:
-        p = np.reshape(p, newshape=(p.shape[0], -1))
-        return np.linalg.norm(p, ord=2, axis=1)
-    elif dim == p.ndim - 1:
-        p = np.reshape(p, newshape=(-1, p.shape[-1]))
-        return np.linalg.norm(p, ord=2, axis=0)
-    else:
-        perm = list(range(p.ndim))
-        perm[0] = dim
-        perm[dim] = 0
-        return _norm(np.transpose(p, axes=perm))
-
-
-class Conv1D(dg.Layer):
-    """
-    A convolution 1D block implemented with Conv2D. Form simplicity and 
-    ensuring the output has the same length as the input, it does not allow 
-    stride > 1.
-    """
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size=3,
-                 dilation=1,
-                 groups=None,
-                 causal=False,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 dtype="float32"):
-        super(Conv1D, self).__init__(name_scope, dtype=dtype)
-
-        if causal:
-            padding = dilation * (filter_size - 1)
-        else:
-            padding = (dilation * (filter_size - 1)) // 2
-
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.causal = causal
-        self.padding = padding
-        self.act = act
-
-        self.conv = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=(1, filter_size),
-            stride=(1, 1),
-            dilation=(1, dilation),
-            padding=(0, padding),
-            groups=groups,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            use_cudnn=use_cudnn,
-            act=act,
-            dtype=dtype)
-
-    def forward(self, x):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
-                input channels.
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
-                output channels (num_filters).
-        """
-        x = self.conv(x)
-        if self.filter_size > 1:
-            if self.causal:
-                x = fluid.layers.slice(
-                    x, axes=[3], starts=[0], ends=[-self.padding])
-            elif self.filter_size % 2 == 0:
-                x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
-        return x
-
-    def start_new_sequence(self):
-        self.temp_weight = None
-        self.input_buffer = None
-
-    def add_input(self, x):
-        """
-        Adding input for a time step and compute an output for a time step.
-        
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
-                input channels, and T = 1.
-        Returns:
-            out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
-            means output channels (num_filters), and T = 1.
-            
-        """
-        if self.temp_weight is None:
-            self.temp_weight = self._reshaped_weight()
-
-        window_size = 1 + (self.filter_size - 1) * self.dilation
-        batch_size = x.shape[0]
-        in_channels = x.shape[1]
-
-        if self.filter_size > 1:
-            if self.input_buffer is None:
-                self.input_buffer = fluid.layers.fill_constant(
-                    [batch_size, in_channels, 1, window_size - 1],
-                    dtype=x.dtype,
-                    value=0.0)
-            else:
-                self.input_buffer = self.input_buffer[:, :, :, 1:]
-            self.input_buffer = fluid.layers.concat(
-                [self.input_buffer, x], axis=3)
-            x = self.input_buffer
-            if self.dilation > 1:
-                if not hasattr(self, "indices"):
-                    self.indices = dg.to_variable(
-                        np.arange(0, window_size, self.dilation))
-                tmp = fluid.layers.transpose(
-                    self.input_buffer, perm=[3, 1, 2, 0])
-                tmp = fluid.layers.gather(tmp, index=self.indices)
-                tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
-                x = tmp
-        inputs = fluid.layers.reshape(
-            x, shape=[batch_size, in_channels * 1 * self.filter_size])
-        out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
-        out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
-        out = fluid.layers.reshape(out, out.shape + [1, 1])
-        out = self._helper.append_activation(out, act=self.act)
-        return out
-
-    def _reshaped_weight(self):
-        """
-        Get the linearized weight of convolution filter, cause it is by nature 
-        a matmul weight. And because the model uses weight norm, compute the
-        weight by weight_v * weight_g to make it faster.
-        Returns:
-            weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
-        """
-        shape = self.conv._filter_param_v.shape
-        matrix_shape = [shape[0], np.prod(shape[1:])]
-        weight_matrix = fluid.layers.reshape(
-            self.conv._filter_param_v, shape=matrix_shape)
-        weight_matrix = fluid.layers.elementwise_mul(
-            fluid.layers.l2_normalize(
-                weight_matrix, axis=1),
-            self.conv._filter_param_g,
-            axis=0)
-        return weight_matrix
-
-
-class FC(dg.Layer):
-    """
-    **Fully Connected Layer**
-    This function creates a fully connected layer in the network. It can take
-    one or multiple tensors as its inputs(input can be a list of Variable, see
-    Args in detail). It creates a pair of variables called (magnitude(g), 
-    direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected 
-    weight matrix from each input unit to each output unit. 
-    The fully connected layer multiplies each input tensor
-    with its corresponding weight to produce an output Tensor with shape [M, `size`],
-    where M is batch size. If multiple input tensors are given, the results of
-    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
-    is not None, a bias variable will be created and added to the output.
-    Finally, if activation is not None, it will be applied to the output as well.
-    When the input is single tensor:
-    .. math::
-        Out = Act({X(normalize(V)g) + b})
-    When the input are multiple tensors:
-    .. math::
-        Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b})
-    In the above equation:
-    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
-    * :math:`X_i`: The i-th input tensor.
-    * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
-    * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
-    * :math:`b`: The bias parameter created by this layer (if needed).
-    * :math:`Act`: The activation function.
-    * :math:`Out`: The output tensor.
-    See below for an example.
-    .. code-block:: text
-        Given:
-            data_1.data = [[[0.1, 0.2],
-                           [0.3, 0.4]]]
-            data_1.shape = (1, 2, 2) # 1 is batch_size
-            data_2 = [[[0.1, 0.2, 0.3]]]
-            data_2.shape = (1, 1, 3)
-            out = fluid.layers.fc(input=[data_1, data_2], size=2)
-        Then:
-            out.data = [[0.18669507, 0.1893476]]
-            out.shape = (1, 2)
-    Args:
-        name_scope(str): The name of this class.
-        size(int): The number of output units in this layer.
-        num_flatten_dims (int): The fc layer can accept an input tensor with more than
-            two dimensions. If this happens, the multidimensional tensor will first be flattened
-            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
-            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
-            dimensions will be flatten to form the first dimension of the final matrix (height of
-            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
-            form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
-            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
-        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act (str|None): Activation to be applied to the output of this layer.
-        is_test(bool): A flag indicating whether execution is in test phase. Default: False
-        dtype(str): Dtype used for weight
-    Raises:
-        ValueError: If rank of the input tensor is less than 2.
-    Examples:
-        .. code-block:: python
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import FC
-          import numpy as np
-          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
-          with fluid.dygraph.guard():
-              fc = FC( "fc", 64, num_flatten_dims=2)
-              data = to_variable( data )
-              conv = fc( data )
-    """
-
-    def __init__(self,
-                 name_scope,
-                 size,
-                 num_flatten_dims=1,
-                 epsilon=1e-30,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None,
-                 is_test=False,
-                 dtype="float32"):
-        super(FC, self).__init__(name_scope, dtype)
-
-        self._size = size
-        self._num_flatten_dims = num_flatten_dims
-        self._epsilon = epsilon
-        self._dtype = dtype
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self.__g = list()
-        self.__v = list()
-
-    @property
-    def _v(self, i=0):
-        return self.__v[i]
-
-    @property
-    def _g(self, i=0):
-        return self.__g[i]
-
-    @_v.setter
-    def _v(self, value, i=0):
-        assert isinstance(value, Parameter)
-        self.__v[i] = value
-
-    @_g.setter
-    def _g(self, value, i=0):
-        assert isinstance(value, Parameter)
-        self.__g[i] = value
-
-    def _build_once(self, input):
-        i = 0
-        for inp, param in self._helper.iter_inputs_and_params(
-                input, self._param_attr):
-            input_shape = inp.shape
-
-            param_shape = [
-                reduce(lambda a, b: a * b,
-                       input_shape[self._num_flatten_dims:], 1)
-            ] + [self._size]
-            self.__v.append(
-                self.add_parameter(
-                    "_v%d" % i,
-                    self.create_parameter(
-                        attr=param,
-                        shape=param_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
-
-            magnitude_shape = param_shape[1:]
-            magnitude_value = np.linalg.norm(
-                self.__v[i].numpy(), ord=2, axis=0)
-
-            self.__g.append(
-                self.add_parameter(
-                    "_g%d" % i,
-                    self.create_parameter(
-                        attr=fluid.ParamAttr(initializer=fluid.initializer.
-                                             NumpyArrayInitializer(
-                                                 magnitude_value)),
-                        shape=magnitude_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
-            i += 1
-
-        size = list([self._size])
-        self._b = self.create_parameter(
-            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
-
-    def forward(self, input):
-        mul_results = list()
-        i = 0
-        for inp, param in self._helper.iter_inputs_and_params(
-                input, self._param_attr):
-            v_norm = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            v_normalized = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="norm",
-                inputs={"X": self.__v[i]},
-                outputs={"Out": v_normalized,
-                         "Norm": v_norm},
-                attrs={"axis": 0,
-                       "epsilon": self._epsilon})
-            weight = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="elementwise_mul",
-                inputs={"X": [v_normalized],
-                        "Y": [self.__g[i]]},
-                outputs={"Out": [weight]},
-                attrs={"axis": 1})
-            tmp = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="mul",
-                inputs={"X": inp,
-                        "Y": weight},
-                outputs={"Out": tmp},
-                attrs={
-                    "x_num_col_dims": self._num_flatten_dims,
-                    "y_num_col_dims": 1
-                })
-            i += 1
-            mul_results.append(tmp)
-
-        if len(mul_results) == 1:
-            pre_bias = mul_results[0]
-        else:
-            pre_bias = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="sum",
-                inputs={"X": mul_results},
-                outputs={"Out": pre_bias},
-                attrs={"use_mkldnn": False})
-
-        if self._b:
-            pre_activation = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type="elementwise_add",
-                inputs={"X": [pre_bias],
-                        "Y": [self._b]},
-                outputs={"Out": [pre_activation]},
-                attrs={"axis": self._num_flatten_dims})
-        else:
-            pre_activation = pre_bias
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(pre_activation, act=self._act)
-
-
-class Conv2D(dg.Layer):
-    """
-    The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input and
-    Output are in NCHW format, where N is batch size, C is the number of
-    channels, H is the height of the feature, and W is the width of the feature.
-    Filter is in MCHW format, where M is the number of output image channels,
-    C is the number of input image channels, H is the height of the filter,
-    and W is the width of the filter. If the groups is greater than 1,
-    C will equal the number of input image channels divided by the groups.
-    Please refer to UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`
-    for more detials.
-    If bias attribution and activation type are provided, bias is added to the
-    output of the convolution, and the corresponding activation function is
-    applied to the final result.
-    For each input :math:`X`, the equation is:
-    .. math::
-        Out = \sigma ((Vg) \\ast X + b)
-    Where:
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`V`: Filter direction value, a tensor with MCHW format.
-    * :math:`g`: Filter magnitude value, a tensor with M format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    Example:
-        - Input:
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-        - Output:
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-        Where
-        .. math::
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-    Args:
-        name_scope(str) : The name for this class.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups (int): The groups number of the Conv2d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-    Examples:
-        .. code-block:: python
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import Conv2D
-          import numpy as np
-          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
-          with fluid.dygraph.guard():
-              conv2d = Conv2D( "conv2d", 2, 3)
-              data = to_variable( data )
-              conv = conv2d( data )
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 epsilon=1e-30,
-                 dtype="float32"):
-        assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name_scope, dtype)
-        self._groups = groups
-        self._stride = utils.convert_to_list(stride, 2, "stride")
-        self._padding = utils.convert_to_list(padding, 2, "padding")
-        self._dilation = utils.convert_to_list(dilation, 2, "dilation")
-        self._act = act
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-        self._filter_size = filter_size
-        self._num_filters = num_filters
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._epsilon = epsilon
-        self._dtype = dtype
-        # if (self._num_channels == self._groups and
-        #         num_filters % self._num_channels == 0 and not self._use_cudnn):
-        #     self._l_type = 'depthwise_conv2d'
-        # else:
-        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
-        #  kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
-        self._l_type = "conv2d"
-
-    def _build_once(self, input):
-        self._num_channels = input.shape[1]
-        if self._groups is None:
-            num_filter_channels = self._num_channels
-        else:
-            if self._num_channels % self._groups != 0:
-                raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = self._num_channels // self._groups
-        filter_size = utils.convert_to_list(self._filter_size, 2,
-                                            "filter_size")
-        filter_shape = [self._num_filters, int(num_filter_channels)
-                        ] + filter_size
-
-        def _get_default_param_initializer():
-            filter_elem_num = filter_size[0] * filter_size[
-                1] * self._num_channels
-            std = (2.0 / filter_elem_num)**0.5
-            return Normal(0.0, std, 0)
-
-        # weight_v
-        self._filter_param_v = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer())
-
-        # weight_g
-        norm_value = _norm(
-            self._filter_param_v.numpy(), dim=0)  # CAUTION: hard-code
-        self._filter_param_g = self.create_parameter(
-            attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    norm_value)),
-            shape=norm_value.shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer())
-
-        if self._use_cudnn:
-            self.create_variable(
-                name="kCUDNNFwdAlgoCache",
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-            self.create_variable(
-                name="kCUDNNBwdDataAlgoCache",
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-            self.create_variable(
-                name="kCUDNNBwdFilterAlgoCache",
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        matrix = self._helper.create_variable_for_type_inference(self._dtype)
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        new_shape = [
-            self._filter_param_v.shape[0],
-            reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
-        ]
-
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": self._filter_param_v},
-            attrs={"shape": new_shape},
-            outputs={"Out": matrix,
-                     "XShape": tmp})
-
-        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
-        m_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="norm",
-            inputs={"X": matrix},
-            outputs={"Out": m_normalized,
-                     "Norm": m_norm},
-            attrs={"axis": 1,
-                   "epsilon": self._epsilon})
-
-        v_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": m_normalized},
-            attrs={"shape": self._filter_param_v.shape},
-            outputs={"Out": v_normalized,
-                     "XShape": tmp2})
-
-        filter_param = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="elementwise_mul",
-            inputs={"X": [v_normalized],
-                    "Y": [self._filter_param_g]},
-            outputs={"Out": [filter_param]},
-            attrs={"axis": 0},  # CAUTION: hard-code
-        )
-
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-
-        self._helper.append_op(
-            type=self._l_type,
-            inputs={"Input": input,
-                    "Filter": filter_param},
-            outputs={"Output": pre_bias},
-            attrs={
-                "strides": self._stride,
-                "paddings": self._padding,
-                "dilations": self._dilation,
-                "groups": self._groups if self._groups else 1,
-                "use_cudnn": self._use_cudnn,
-                "use_mkldnn": False,
-            })
-
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type="elementwise_add",
-                inputs={"X": [pre_bias],
-                        "Y": [self._bias_param]},
-                outputs={"Out": [pre_act]},
-                attrs={"axis": 1})
-        else:
-            pre_act = pre_bias
-
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Conv2DTranspose(dg.Layer):
-    """
-    **Convlution2D transpose layer**
-    The convolution2D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCHW format. Where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-    Parameters(dilations, strides, paddings) are two elements. These two elements
-    represent height and width, respectively. The details of convolution transpose
-    layer, please refer to the following explanation and references
-    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-    For each input :math:`X`, the equation is:
-    .. math::
-        Out = \sigma ((Vg) \\ast X + b)
-    Where:
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`V`: Filter value, a tensor with MCHW format.
-    * :math:`g`: Filter value, a tensor with M format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    Example:
-        - Input:
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-        - Output:
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-        Where
-        .. math::
-           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
-           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
-    Args:
-        name_scope(str): The name of this class.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square. None if use output size to
-            calculate filter_size. Default: None.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups = 1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-    Returns:
-        Variable: The tensor variable storing the convolution transpose result.
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-    Examples:
-       .. code-block:: python
-          import paddle.fluid as fluid
-          import numpy
-          with fluid.dygraph.guard():
-              data = numpy.random.random((3, 32, 32)).astype('float32')
-              conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
-                    'Conv2DTranspose', num_filters=2, filter_size=3)
-              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 output_size=None,
-                 filter_size=None,
-                 padding=0,
-                 stride=1,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 epsilon=1e-30,
-                 act=None,
-                 dtype="float32"):
-        super(Conv2DTranspose, self).__init__(name_scope, dtype)
-        assert (param_attr is not False
-                ), "param_attr should not be False in conv2d_transpose."
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._groups = groups
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._padding = padding
-        self._stride = stride
-        self._dilation = dilation
-        self._filter_size = filter_size
-        self._output_size = output_size
-        self._op_type = "conv2d_transpose"
-        self._epsilon = epsilon
-
-    def _build_once(self, input):
-        input_channel = input.shape[1]
-        if (input_channel == self._groups and
-                self._num_filters == input_channel and not self._use_cudnn):
-            self._op_type = "depthwise_conv2d_transpose"
-
-        if not isinstance(input, Variable):
-            raise TypeError("Input of conv2d_transpose must be Variable")
-
-        self._padding = utils.convert_to_list(self._padding, 2, "padding")
-        self._stride = utils.convert_to_list(self._stride, 2, "stride")
-        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")
-
-        if not isinstance(self._use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-
-        if self._filter_size is None:
-            if self._output_size is None:
-                raise ValueError(
-                    "output_size must be set when filter_size is None")
-            if isinstance(self._output_size, int):
-                self._output_size = [self._output_size, self._output_size]
-
-            h_in = input.shape[2]
-            w_in = input.shape[3]
-
-            filter_size_h = (self._output_size[0] -
-                             (h_in - 1) * self._stride[0] + 2 *
-                             self._padding[0] - 1) // self._dilation[0] + 1
-            filter_size_w = (self._output_size[1] -
-                             (w_in - 1) * self._stride[1] + 2 *
-                             self._padding[1] - 1) // self._dilation[1] + 1
-            self._filter_size = [filter_size_h, filter_size_w]
-        else:
-            self._filter_size = utils.convert_to_list(
-                self._filter_size, 2, "conv2d_transpose.filter_size")
-
-        if self._output_size is None:
-            self._output_size = []
-        elif isinstance(self._output_size, list) or isinstance(
-                self._output_size, int):
-            self._output_size = utils.convert_to_list(self._output_size, 2,
-                                                      "output_size")
-        else:
-            raise ValueError("output_size should be list or int")
-        self._padding = utils.convert_to_list(self._padding, 2, "padding")
-        self._groups = 1 if self._groups is None else self._groups
-        filter_shape = [
-            input_channel,
-            self._num_filters // self._groups,
-        ] + self._filter_size
-
-        # img filter v (direction)
-        self._img_filter_v = self.create_parameter(
-            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
-
-        # img filter g (magnitude)
-        img_filter_magnitude = _norm(
-            self._img_filter_v.numpy(), dim=0)  # CAUTION: hard-code
-        self._img_filter_g = self.create_parameter(
-            dtype=input.dtype,
-            shape=img_filter_magnitude.shape,
-            attr=fluid.ParamAttr(
-                initializer=NumpyArrayInitializer(img_filter_magnitude)))
-
-        self._img_bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        matrix = self._helper.create_variable_for_type_inference(self._dtype)
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        new_shape = [
-            self._img_filter_v.shape[0],
-            reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
-        ]
-
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": self._img_filter_v},
-            attrs={"shape": new_shape},
-            outputs={"Out": matrix,
-                     "XShape": tmp})
-
-        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
-        m_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="norm",
-            inputs={"X": matrix},
-            outputs={"Out": m_normalized,
-                     "Norm": m_norm},
-            attrs={"axis": 1,
-                   "epsilon": self._epsilon})
-
-        v_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": m_normalized},
-            attrs={"shape": self._img_filter_v.shape},
-            outputs={"Out": v_normalized,
-                     "XShape": tmp2})
-
-        img_filter = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="elementwise_mul",
-            inputs={"X": [v_normalized],
-                    "Y": [self._img_filter_g]},
-            outputs={"Out": [img_filter]},
-            attrs={"axis": 0},  # CAUTION: hard-code
-        )
-
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        self._helper.append_op(
-            type=self._op_type,
-            inputs={"Input": [input],
-                    "Filter": [img_filter]},
-            outputs={"Output": pre_bias},
-            attrs={
-                "output_size": self._output_size,
-                "strides": self._stride,
-                "paddings": self._padding,
-                "dilations": self._dilation,
-                "groups": self._groups,
-                "use_cudnn": self._use_cudnn,
-            })
-
-        if self._img_bias is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type="elementwise_add",
-                inputs={"X": [pre_bias],
-                        "Y": [self._img_bias]},
-                outputs={"Out": [pre_act]},
-                attrs={"axis": 1})
-        else:
-            pre_act = pre_bias
-
-        out = self._helper.append_activation(pre_act)
-        return out
diff --git a/parakeet/modules/modules.py b/parakeet/modules/modules.py
index 4fb92ed..7aef463 100644
--- a/parakeet/modules/modules.py
+++ b/parakeet/modules/modules.py
@@ -26,6 +26,7 @@ def FC(name_scope,
        in_features,
        size,
        num_flatten_dims=1,
+       relu=False,
        dropout=0.0,
        epsilon=1e-30,
        act=None,
@@ -39,7 +40,11 @@ def FC(name_scope,
     # stds
     if isinstance(in_features, int):
         in_features = [in_features]
+
     stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
+    if relu:
+        stds = [std * np.sqrt(2.0) for std in stds]
+
     weight_inits = [
         fluid.initializer.NormalInitializer(scale=std) for std in stds
     ]
@@ -456,3 +461,152 @@ class PositionEmbedding(dg.Layer):
             return out
         else:
             raise Exception("Then you can just use position rate at init")
+
+
+class Conv1D_GU(dg.Layer):
+    def __init__(self,
+                 name_scope,
+                 conditioner_dim,
+                 in_channels,
+                 num_filters,
+                 filter_size,
+                 dilation,
+                 causal=False,
+                 residual=True,
+                 dtype="float32"):
+        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)
+
+        self.conditioner_dim = conditioner_dim
+        self.in_channels = in_channels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.dilation = dilation
+        self.causal = causal
+        self.residual = residual
+
+        if residual:
+            assert (
+                in_channels == num_filters
+            ), "this block uses residual connection"\
+                "the input_channels should equals num_filters"
+
+        self.conv = Conv1D(
+            self.full_name(),
+            in_channels,
+            2 * num_filters,
+            filter_size,
+            dilation,
+            causal=causal,
+            dtype=dtype)
+
+        self.fc = Conv1D(
+            self.full_name(),
+            conditioner_dim,
+            2 * num_filters,
+            filter_size=1,
+            dilation=1,
+            causal=False,
+            dtype=dtype)
+
+    def forward(self, x, skip=None, conditioner=None):
+        """
+        Args:
+            x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU
+                layer, where B means batch_size, C_in means the input channels
+                T means input time steps.
+            skip (Variable): Shape(B, C_in, 1, T), skip connection.
+            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
+                conditioner, where C_con is conditioner hidden dim which
+                equals the num of mel bands. Note that when using residual
+                connection, the Conv1D_GU does not change the number of
+                channels, so out channels equals input channels.
+        Returns:
+            x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where
+                C_out means the output channels of Conv1D_GU.
+            skip (Variable): Shape(B, C_out, 1, T), skip connection.
+        """
+        residual = x
+        x = self.conv(x)
+
+        if conditioner is not None:
+            cond_bias = self.fc(conditioner)
+            x += cond_bias
+
+        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
+
+        # Gated Unit.
+        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
+                                         fluid.layers.tanh(content))
+
+        if skip is None:
+            skip = x
+        else:
+            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
+
+        if self.residual:
+            x = fluid.layers.scale(residual + x, np.sqrt(0.5))
+
+        return x, skip
+
+    def add_input(self, x, skip=None, conditioner=None):
+        """
+        Inputs:
+            x: shape(B, num_filters, 1, time_steps)
+            skip: shape(B, num_filters, 1, time_steps), skip connection
+            conditioner: shape(B, conditioner_dim, 1, time_steps)
+        Outputs:
+            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
+            skip: skip connection, same shape as x
+        """
+        residual = x
+
+        # add step input and produce step output
+        x = self.conv.add_input(x)
+
+        if conditioner is not None:
+            cond_bias = self.fc(conditioner)
+            x += cond_bias
+
+        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
+
+        # Gated Unit.
+        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
+                                         fluid.layers.tanh(content))
+
+        if skip is None:
+            skip = x
+        else:
+            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
+
+        if self.residual:
+            x = fluid.layers.scale(residual + x, np.sqrt(0.5))
+
+        return x, skip
+
+
+def Conv2DTranspose(name_scope,
+                    num_filters,
+                    filter_size,
+                    padding=0,
+                    stride=1,
+                    dilation=1,
+                    use_cudnn=True,
+                    act=None,
+                    dtype="float32"):
+    val = 1.0 / (filter_size[0] * filter_size[1])
+    weight_init = fluid.initializer.ConstantInitializer(val)
+    weight_attr = fluid.ParamAttr(initializer=weight_init)
+
+    layer = weight_norm.Conv2DTranspose(
+        name_scope,
+        num_filters,
+        filter_size=filter_size,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        param_attr=weight_attr,
+        use_cudnn=use_cudnn,
+        act=act,
+        dtype=dtype)
+
+    return layer