diff --git a/parakeet/deepvoice3/README.md b/parakeet/models/deepvoice3/README.md
similarity index 100%
rename from parakeet/deepvoice3/README.md
rename to parakeet/models/deepvoice3/README.md
diff --git a/parakeet/deepvoice3/README_cn.md b/parakeet/models/deepvoice3/README_cn.md
similarity index 100%
rename from parakeet/deepvoice3/README_cn.md
rename to parakeet/models/deepvoice3/README_cn.md
diff --git a/parakeet/deepvoice3/_ce.py b/parakeet/models/deepvoice3/_ce.py
similarity index 100%
rename from parakeet/deepvoice3/_ce.py
rename to parakeet/models/deepvoice3/_ce.py
diff --git a/parakeet/deepvoice3/_images/model_architecture.png b/parakeet/models/deepvoice3/_images/model_architecture.png
similarity index 100%
rename from parakeet/deepvoice3/_images/model_architecture.png
rename to parakeet/models/deepvoice3/_images/model_architecture.png
diff --git a/parakeet/deepvoice3/audio.py b/parakeet/models/deepvoice3/audio.py
similarity index 100%
rename from parakeet/deepvoice3/audio.py
rename to parakeet/models/deepvoice3/audio.py
diff --git a/parakeet/deepvoice3/builder.py b/parakeet/models/deepvoice3/builder.py
similarity index 100%
rename from parakeet/deepvoice3/builder.py
rename to parakeet/models/deepvoice3/builder.py
diff --git a/parakeet/deepvoice3/compute_timestamp_ratio.py b/parakeet/models/deepvoice3/compute_timestamp_ratio.py
similarity index 100%
rename from parakeet/deepvoice3/compute_timestamp_ratio.py
rename to parakeet/models/deepvoice3/compute_timestamp_ratio.py
diff --git a/parakeet/deepvoice3/data.py b/parakeet/models/deepvoice3/data.py
similarity index 100%
rename from parakeet/deepvoice3/data.py
rename to parakeet/models/deepvoice3/data.py
diff --git a/parakeet/deepvoice3/deepvoice3.py b/parakeet/models/deepvoice3/deepvoice3.py
similarity index 100%
rename from parakeet/deepvoice3/deepvoice3.py
rename to parakeet/models/deepvoice3/deepvoice3.py
diff --git a/parakeet/deepvoice3/dry_run.py b/parakeet/models/deepvoice3/dry_run.py
similarity index 100%
rename from parakeet/deepvoice3/dry_run.py
rename to parakeet/models/deepvoice3/dry_run.py
diff --git a/parakeet/deepvoice3/eval_model.py b/parakeet/models/deepvoice3/eval_model.py
similarity index 100%
rename from parakeet/deepvoice3/eval_model.py
rename to parakeet/models/deepvoice3/eval_model.py
diff --git a/parakeet/deepvoice3/hparams.py b/parakeet/models/deepvoice3/hparams.py
similarity index 100%
rename from parakeet/deepvoice3/hparams.py
rename to parakeet/models/deepvoice3/hparams.py
diff --git a/parakeet/deepvoice3/ljspeech.py b/parakeet/models/deepvoice3/ljspeech.py
similarity index 100%
rename from parakeet/deepvoice3/ljspeech.py
rename to parakeet/models/deepvoice3/ljspeech.py
diff --git a/parakeet/deepvoice3/preprocess.py b/parakeet/models/deepvoice3/preprocess.py
similarity index 100%
rename from parakeet/deepvoice3/preprocess.py
rename to parakeet/models/deepvoice3/preprocess.py
diff --git a/parakeet/deepvoice3/presets/deepvoice3_ljspeech.json b/parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json
similarity index 100%
rename from parakeet/deepvoice3/presets/deepvoice3_ljspeech.json
rename to parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json
diff --git a/parakeet/deepvoice3/synthesis.py b/parakeet/models/deepvoice3/synthesis.py
similarity index 100%
rename from parakeet/deepvoice3/synthesis.py
rename to parakeet/models/deepvoice3/synthesis.py
diff --git a/parakeet/deepvoice3/train.py b/parakeet/models/deepvoice3/train.py
similarity index 100%
rename from parakeet/deepvoice3/train.py
rename to parakeet/models/deepvoice3/train.py
diff --git a/parakeet/deepvoice3/train.sh b/parakeet/models/deepvoice3/train.sh
similarity index 100%
rename from parakeet/deepvoice3/train.sh
rename to parakeet/models/deepvoice3/train.sh
diff --git a/parakeet/deepvoice3/train_model.py b/parakeet/models/deepvoice3/train_model.py
similarity index 100%
rename from parakeet/deepvoice3/train_model.py
rename to parakeet/models/deepvoice3/train_model.py
diff --git a/parakeet/modules/__init__.py b/parakeet/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py
new file mode 100644
index 0000000..34149be
--- /dev/null
+++ b/parakeet/modules/conv.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy as np
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+from .weight_norm import Conv2D, Conv2DTranspose
+
+
+class Conv1D(dg.Layer):
+ """
+    A 1D convolution block implemented with Conv2D. For simplicity, and to
+    ensure that the output has the same length as the input, it does not
+    allow stride > 1.
+ """
+
+ def __init__(self,
+ name_scope,
+                 in_channels,
+ num_filters,
+ filter_size=3,
+ dilation=1,
+ groups=None,
+ causal=False,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ act=None,
+ dtype="float32"):
+ super(Conv1D, self).__init__(name_scope, dtype=dtype)
+
+ if causal:
+ padding = dilation * (filter_size - 1)
+ else:
+ padding = (dilation * (filter_size - 1)) // 2
+
+        self.in_channels = in_channels
+ self.num_filters = num_filters
+ self.filter_size = filter_size
+ self.dilation = dilation
+ self.causal = causal
+ self.padding = padding
+ self.act = act
+
+ self.conv = Conv2D(
+ self.full_name(),
+ num_filters=num_filters,
+ filter_size=(1, filter_size),
+ stride=(1, 1),
+ dilation=(1, dilation),
+ padding=(0, padding),
+ groups=groups,
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ use_cudnn=use_cudnn,
+ act=act,
+ dtype=dtype)
+
+ def forward(self, x):
+ """
+ Args:
+ x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
+ input channels.
+
+ Returns:
+ x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
+ output channels (num_filters).
+ """
+ x = self.conv(x)
+ if self.filter_size > 1:
+ if self.causal:
+ x = fluid.layers.slice(
+ x, axes=[3], starts=[0], ends=[-self.padding])
+ elif self.filter_size % 2 == 0:
+ x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
+ return x
+
+ def start_new_sequence(self):
+ self.temp_weight = None
+ self.input_buffer = None
+
+ def add_input(self, x):
+ """
+        Add input for a time step and compute the output for that time step.
+
+ Args:
+ x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
+ input channels, and T = 1.
+
+ Returns:
+ out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
+ means output channels (num_filters), and T = 1.
+
+ """
+ if self.temp_weight is None:
+ self.temp_weight = self._reshaped_weight()
+
+ window_size = 1 + (self.filter_size - 1) * self.dilation
+ batch_size = x.shape[0]
+ in_channels = x.shape[1]
+
+ if self.filter_size > 1:
+ if self.input_buffer is None:
+ self.input_buffer = fluid.layers.fill_constant(
+ [batch_size, in_channels, 1, window_size - 1],
+ dtype=x.dtype,
+ value=0.0)
+ else:
+ self.input_buffer = self.input_buffer[:, :, :, 1:]
+ self.input_buffer = fluid.layers.concat(
+ [self.input_buffer, x], axis=3)
+ x = self.input_buffer
+ if self.dilation > 1:
+ if not hasattr(self, "indices"):
+ self.indices = dg.to_variable(
+ np.arange(0, window_size, self.dilation))
+ tmp = fluid.layers.transpose(
+ self.input_buffer, perm=[3, 1, 2, 0])
+ tmp = fluid.layers.gather(tmp, index=self.indices)
+ tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
+ x = tmp
+ inputs = fluid.layers.reshape(
+ x, shape=[batch_size, in_channels * 1 * self.filter_size])
+ out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
+ out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
+ out = fluid.layers.reshape(out, out.shape + [1, 1])
+ out = self._helper.append_activation(out, act=self.act)
+ return out
+
+ def _reshaped_weight(self):
+ """
+        Get the linearized weight of the convolution filter, since it is by
+        nature a matmul weight. Because the model uses weight normalization,
+        compute the weight from weight_v and weight_g directly, which is faster.
+
+ Returns:
+ weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
+ """
+ shape = self.conv._filter_param_v.shape
+ matrix_shape = [shape[0], np.prod(shape[1:])]
+ weight_matrix = fluid.layers.reshape(
+ self.conv._filter_param_v, shape=matrix_shape)
+ weight_matrix = fluid.layers.elementwise_mul(
+ fluid.layers.l2_normalize(
+ weight_matrix, axis=1),
+ self.conv._filter_param_g,
+ axis=0)
+ return weight_matrix
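+
+# Usage sketch (illustrative only, based on the docstrings above): inside a
+# ``fluid.dygraph.guard()`` context, autoregressive decoding with a causal
+# Conv1D could look like this, where ``steps`` is a hypothetical iterable
+# of inputs of shape (B, C_in, 1, 1):
+#
+#     conv = Conv1D("conv", 8, 8, filter_size=3, causal=True)
+#     conv.start_new_sequence()
+#     for x_t in steps:
+#         y_t = conv.add_input(x_t)  # y_t: shape (B, C_out, 1, 1)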
+
+
+class Conv1DTranspose(dg.Layer):
+ """
+    A 1D transposed convolution block implemented with a 2D transposed
+    convolution. It does not guarantee that the output is expanded exactly
+    stride times in the time dimension.
+ """
+
+ def __init__(self,
+ name_scope,
+ in_channels,
+ num_filters,
+ filter_size,
+ padding=0,
+ stride=1,
+ dilation=1,
+ groups=None,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ act=None,
+ dtype="float32"):
+ super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype)
+
+ self.in_channels = in_channels
+ self.num_filters = num_filters
+ self.filter_size = filter_size
+ self.padding = padding
+ self.stride = stride
+ self.dilation = dilation
+ self.groups = groups
+
+ self.conv_transpose = Conv2DTranspose(
+ self.full_name(),
+ num_filters,
+ filter_size=(1, filter_size),
+ padding=(0, padding),
+ stride=(1, stride),
+ dilation=(1, dilation),
+ groups=groups,
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ use_cudnn=use_cudnn,
+ act=act,
+ dtype=dtype)
+
+ def forward(self, x):
+ """
+        Args:
+ x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input
+ channels and T_in means the number of time steps of input.
+
+ Returns:
+            out (Variable): Shape(B, C_out, 1, T_out), where C_out means the
+                output channels and T_out means the number of time steps of
+                the output.
+ """
+ return self.conv_transpose(x)
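+
+# Usage sketch (illustrative): upsampling an 80-channel input 4x in time,
+# inside a ``fluid.dygraph.guard()`` context:
+#
+#     upsample = Conv1DTranspose("up", in_channels=80, num_filters=80,
+#                                filter_size=4, stride=4)
+#     y = upsample(x)  # x: (B, 80, 1, T) -> y: (B, 80, 1, ~4T)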
diff --git a/parakeet/modules/loss.py b/parakeet/modules/loss.py
new file mode 100644
index 0000000..96bcd3b
--- /dev/null
+++ b/parakeet/modules/loss.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from numba import jit
+
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+
+def masked_mean(inputs, mask):
+ """
+ Args:
+ inputs (Variable): Shape(B, C, 1, T), the input, where B means
+ batch size, C means channels of input, T means timesteps of
+ the input.
+ mask (Variable): Shape(B, T), a mask.
+ Returns:
+ loss (Variable): Shape(1, ), masked mean.
+ """
+ channels = inputs.shape[1]
+ reshaped_mask = fluid.layers.reshape(
+ mask, shape=[mask.shape[0], 1, 1, mask.shape[-1]])
+ expanded_mask = fluid.layers.expand(
+ reshaped_mask, expand_times=[1, channels, 1, 1])
+ expanded_mask.stop_gradient = True
+
+ valid_cnt = fluid.layers.reduce_sum(expanded_mask)
+ valid_cnt.stop_gradient = True
+
+ masked_inputs = inputs * expanded_mask
+ loss = fluid.layers.reduce_sum(masked_inputs) / valid_cnt
+ return loss
+
+
+@jit(nopython=True)
+def guided_attention(N, max_N, T, max_T, g):
+ W = np.zeros((max_N, max_T), dtype=np.float32)
+ for n in range(N):
+ for t in range(T):
+ W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
+ return W
+
+
+def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2):
+ B = len(input_lengths)
+ max_input_len = input_lengths.max()
+ W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32)
+ for b in range(B):
+ W[b] = guided_attention(input_lengths[b], max_input_len,
+ target_lengths[b], max_target_len, g).T
+ return W
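+
+# Worked example (illustrative): for a batch of two utterances, the guide
+# below softly penalizes attention mass that strays from the diagonal.
+#
+#     input_lengths = np.array([10, 8], dtype=np.int64)
+#     target_lengths = np.array([50, 40], dtype=np.int64)
+#     W = guided_attentions(input_lengths, target_lengths,
+#                           max_target_len=50)  # W.shape == (2, 50, 10)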
+
+
+class TTSLoss(object):
+ def __init__(self,
+ masked_weight=0.0,
+ priority_weight=0.0,
+ binary_divergence_weight=0.0,
+ guided_attention_sigma=0.2):
+ self.masked_weight = masked_weight
+ self.priority_weight = priority_weight
+ self.binary_divergence_weight = binary_divergence_weight
+ self.guided_attention_sigma = guided_attention_sigma
+
+ def l1_loss(self, prediction, target, mask, priority_bin=None):
+ abs_diff = fluid.layers.abs(prediction - target)
+
+ # basic mask-weighted l1 loss
+ w = self.masked_weight
+ if w > 0 and mask is not None:
+ base_l1_loss = w * masked_mean(abs_diff, mask) + (
+ 1 - w) * fluid.layers.reduce_mean(abs_diff)
+ else:
+ base_l1_loss = fluid.layers.reduce_mean(abs_diff)
+
+ if self.priority_weight > 0 and priority_bin is not None:
+ # mask-weighted priority channels' l1-loss
+ priority_abs_diff = fluid.layers.slice(
+ abs_diff, axes=[1], starts=[0], ends=[priority_bin])
+ if w > 0 and mask is not None:
+ priority_loss = w * masked_mean(priority_abs_diff, mask) + (
+ 1 - w) * fluid.layers.reduce_mean(priority_abs_diff)
+ else:
+ priority_loss = fluid.layers.reduce_mean(priority_abs_diff)
+
+ # priority weighted sum
+ p = self.priority_weight
+ loss = p * priority_loss + (1 - p) * base_l1_loss
+ else:
+ loss = base_l1_loss
+ return loss
+
+ def binary_divergence(self, prediction, target, mask):
+ flattened_prediction = fluid.layers.reshape(prediction, [-1, 1])
+ flattened_target = fluid.layers.reshape(target, [-1, 1])
+ flattened_loss = fluid.layers.log_loss(
+ flattened_prediction, flattened_target, epsilon=1e-8)
+ bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
+
+ w = self.masked_weight
+ if w > 0 and mask is not None:
+ loss = w * masked_mean(bin_div, mask) + (
+ 1 - w) * fluid.layers.reduce_mean(bin_div)
+ else:
+ loss = fluid.layers.reduce_mean(bin_div)
+ return loss
+
+ @staticmethod
+ def done_loss(done_hat, done):
+ flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1])
+ flat_done = fluid.layers.reshape(done, [-1, 1])
+ loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
+ loss = fluid.layers.reduce_mean(loss)
+ return loss
+
+ def attention_loss(self, predicted_attention, input_lengths,
+ target_lengths):
+ """
+        Given valid input_lengths and target_lengths, compute a diagonal
+        guide, and compute loss from the predicted attention and the guide.
+
+ Args:
+ predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the
+ alignment tensor, where B means batch size, T_dec means number
+ of time steps of the decoder, T_enc means the number of time
+ steps of the encoder, * means other possible dimensions.
+ input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
+ (time steps) of encoder outputs.
+ target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64,
+ valid lengths (time steps) of decoder outputs.
+
+ Returns:
+ loss (Variable): Shape(1, ) attention loss.
+ """
+ n_attention, batch_size, max_target_len, max_input_len = (
+ predicted_attention.shape)
+ soft_mask = guided_attentions(input_lengths, target_lengths,
+ max_target_len,
+ self.guided_attention_sigma)
+ soft_mask_ = dg.to_variable(soft_mask)
+ loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
+ return loss
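+
+# Usage sketch (illustrative): the individual terms are combined by the
+# training script; mel_hat, mel, mask, alignments, enc_lens and dec_lens
+# below are hypothetical:
+#
+#     criterion = TTSLoss(masked_weight=0.5, priority_weight=0.3)
+#     loss = (criterion.l1_loss(mel_hat, mel, mask, priority_bin=64)
+#             + criterion.binary_divergence(mel_hat, mel, mask)
+#             + criterion.attention_loss(alignments, enc_lens, dec_lens))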
diff --git a/parakeet/modules/modules.py b/parakeet/modules/modules.py
new file mode 100644
index 0000000..3ae95d7
--- /dev/null
+++ b/parakeet/modules/modules.py
@@ -0,0 +1,484 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+import numpy as np
+
+from . import conv
+from . import weight_norm
+
+
+def FC(name_scope,
+ in_features,
+ size,
+ num_flatten_dims=1,
+ dropout=0.0,
+ epsilon=1e-30,
+ act=None,
+ is_test=False,
+ dtype="float32"):
+ """
+    A special Linear layer. When it is used with dropout, the weight is
+    initialized as normal(0, std=np.sqrt((1 - dropout) / in_features)).
+ """
+
+ # stds
+ if isinstance(in_features, int):
+ in_features = [in_features]
+ stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
+ weight_inits = [
+ fluid.initializer.NormalInitializer(scale=std) for std in stds
+ ]
+ bias_init = fluid.initializer.ConstantInitializer(0.0)
+
+ # param attrs
+ weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
+ bias_attr = fluid.ParamAttr(initializer=bias_init)
+
+ layer = weight_norm.FC(name_scope,
+ size,
+ num_flatten_dims=num_flatten_dims,
+ param_attr=weight_attrs,
+ bias_attr=bias_attr,
+ act=act,
+ dtype=dtype)
+ return layer
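+
+# Note (illustrative): with dropout=0.1 and in_features=256, the std above
+# is np.sqrt(0.9 / 256), roughly 0.0593.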
+
+
+def Conv1D(name_scope,
+ in_channels,
+ num_filters,
+ filter_size=3,
+ dilation=1,
+ groups=None,
+ causal=False,
+ std_mul=1.0,
+ dropout=0.0,
+ use_cudnn=True,
+ act=None,
+ dtype="float32"):
+ """
+    A special Conv1D layer. When it is used with dropout, the weight is
+ initialized as
+ normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features)))
+ """
+ # std
+ std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
+ weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
+ bias_init = fluid.initializer.ConstantInitializer(0.0)
+
+ # param attrs
+ weight_attr = fluid.ParamAttr(initializer=weight_init)
+ bias_attr = fluid.ParamAttr(initializer=bias_init)
+
+ layer = conv.Conv1D(
+ name_scope,
+ in_channels,
+ num_filters,
+ filter_size,
+ dilation,
+ groups=groups,
+ causal=causal,
+ param_attr=weight_attr,
+ bias_attr=bias_attr,
+ use_cudnn=use_cudnn,
+ act=act,
+ dtype=dtype)
+ return layer
+
+
+def Embedding(name_scope,
+ num_embeddings,
+ embed_dim,
+ is_sparse=False,
+ is_distributed=False,
+ padding_idx=None,
+ std=0.01,
+ dtype="float32"):
+ # param attrs
+ weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
+ scale=std))
+ layer = dg.Embedding(
+ name_scope, (num_embeddings, embed_dim),
+ padding_idx=padding_idx,
+ param_attr=weight_attr,
+ dtype=dtype)
+ return layer
+
+
+class Conv1DGLU(dg.Layer):
+ """
+    A 1D convolution block with GLU activation. It also applies dropout to the
+    input x. It fuses speaker embeddings through an FC layer activated by
+    softsign. It has a residual connection from the input x, and scales the
+    output by np.sqrt(0.5).
+ """
+
+ def __init__(self,
+ name_scope,
+ n_speakers,
+ speaker_dim,
+ in_channels,
+ num_filters,
+ filter_size,
+ dilation,
+ std_mul=4.0,
+ dropout=0.0,
+ causal=False,
+ residual=True,
+ dtype="float32"):
+ super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
+
+ # conv spec
+ self.in_channels = in_channels
+ self.n_speakers = n_speakers
+ self.speaker_dim = speaker_dim
+ self.num_filters = num_filters
+ self.filter_size = filter_size
+ self.dilation = dilation
+ self.causal = causal
+ self.residual = residual
+
+ # weight init and dropout
+ self.std_mul = std_mul
+ self.dropout = dropout
+
+ if residual:
+ assert (
+ in_channels == num_filters
+ ), "this block uses residual connection"\
+ "the input_channes should equals num_filters"
+
+ self.conv = Conv1D(
+ self.full_name(),
+ in_channels,
+ 2 * num_filters,
+ filter_size,
+ dilation,
+ causal=causal,
+ std_mul=std_mul,
+ dropout=dropout,
+ dtype=dtype)
+
+ if n_speakers > 1:
+ assert (speaker_dim is not None
+ ), "speaker embed should not be null in multi-speaker case"
+ self.fc = Conv1D(
+ self.full_name(),
+ speaker_dim,
+ num_filters,
+ filter_size=1,
+ dilation=1,
+ causal=False,
+ act="softsign",
+ dtype=dtype)
+
+ def forward(self, x, speaker_embed_bc1t=None):
+ """
+ Args:
+            x (Variable): Shape(B, C_in, 1, T), the input of the Conv1DGLU
+                layer, where B means batch_size, C_in means the input
+                channels, and T means the input time steps.
+            speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded
+                speaker embedding, where C_sp means the speaker embedding
+                size. Note that when using a residual connection, Conv1DGLU
+                does not change the number of channels, so the output
+                channels equal the input channels.
+
+ Returns:
+ x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
+ C_out means the output channels of Conv1DGLU.
+ """
+
+ residual = x
+ x = fluid.layers.dropout(
+ x, self.dropout, dropout_implementation="upscale_in_train")
+ x = self.conv(x)
+
+ content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
+
+ if speaker_embed_bc1t is not None:
+ sp = self.fc(speaker_embed_bc1t)
+ content = content + sp
+
+ # glu
+ x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
+
+ if self.residual:
+ x = fluid.layers.scale(x + residual, np.sqrt(0.5))
+ return x
+
+ def add_input(self, x, speaker_embed_bc11=None):
+ """
+ Inputs:
+            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
+            speaker_embed_bc11: shape(B, speaker_dim, 1, 1)
+
+ Outputs:
+ out: shape(B, num_filters, 1, time_steps), where time_steps = 1
+ """
+
+ residual = x
+
+ # add step input and produce step output
+ x = fluid.layers.dropout(
+ x, self.dropout, dropout_implementation="upscale_in_train")
+ x = self.conv.add_input(x)
+
+ content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
+
+ if speaker_embed_bc11 is not None:
+ sp = self.fc(speaker_embed_bc11)
+ content = content + sp
+
+ x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
+
+ if self.residual:
+ x = fluid.layers.scale(x + residual, np.sqrt(0.5))
+ return x
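+
+# Usage sketch (illustrative): a single-speaker residual block keeps the
+# channel count unchanged, e.g. inside a ``fluid.dygraph.guard()`` context:
+#
+#     block = Conv1DGLU("glu", n_speakers=1, speaker_dim=None,
+#                       in_channels=64, num_filters=64, filter_size=3,
+#                       dilation=1, causal=True)
+#     y = block(x)  # x, y: shape (B, 64, 1, T)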
+
+
+def Conv1DTranspose(name_scope,
+ in_channels,
+ num_filters,
+ filter_size,
+ padding=0,
+ stride=1,
+ dilation=1,
+ groups=None,
+ std_mul=1.0,
+ dropout=0.0,
+ use_cudnn=True,
+ act=None,
+ dtype="float32"):
+ std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
+ weight_init = fluid.initializer.NormalInitializer(scale=std)
+ weight_attr = fluid.ParamAttr(initializer=weight_init)
+ bias_init = fluid.initializer.ConstantInitializer(0.0)
+ bias_attr = fluid.ParamAttr(initializer=bias_init)
+ layer = conv.Conv1DTranspose(
+ name_scope,
+ in_channels,
+ num_filters,
+ filter_size,
+ padding=padding,
+ stride=stride,
+ dilation=dilation,
+ groups=groups,
+ param_attr=weight_attr,
+ bias_attr=bias_attr,
+ use_cudnn=use_cudnn,
+ act=act,
+ dtype=dtype)
+ return layer
+
+
+def compute_position_embedding(rad):
+ # rad is a transposed radius, shape(embed_dim, n_vocab)
+ embed_dim, n_vocab = rad.shape
+
+ even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
+ odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
+
+ even_rads = fluid.layers.gather(rad, even_dims)
+ odd_rads = fluid.layers.gather(rad, odd_dims)
+
+ sines = fluid.layers.sin(even_rads)
+ cosines = fluid.layers.cos(odd_rads)
+
+ temp = fluid.layers.scatter(rad, even_dims, sines)
+ out = fluid.layers.scatter(temp, odd_dims, cosines)
+ out = fluid.layers.transpose(out, perm=[1, 0])
+ return out
+
+
+def position_encoding_init(n_position,
+ d_pos_vec,
+ position_rate=1.0,
+ sinusoidal=True):
+ """ Init the sinusoid position encoding table """
+
+ # keep idx 0 for padding token position encoding zero vector
+ position_enc = np.array([[
+ position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
+ for i in range(d_pos_vec)
+ ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
+
+ if sinusoidal:
+ position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
+ position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
+
+ return position_enc
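+
+# Worked example (illustrative): with sinusoidal=True, row ``pos`` holds
+# interleaved sin/cos features at geometrically spaced frequencies, and
+# row 0 stays all zeros (reserved for the padding position):
+#
+#     table = position_encoding_init(100, 16)  # shape (100, 16)
+#     # table[1, 0::2] are sines, table[1, 1::2] are cosines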
+
+
+class PositionEmbedding(dg.Layer):
+ def __init__(self,
+ name_scope,
+ n_position,
+ d_pos_vec,
+ position_rate=1.0,
+ is_sparse=False,
+ is_distributed=False,
+ param_attr=None,
+ max_norm=None,
+ padding_idx=None,
+ dtype="float32"):
+ super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
+ self.embed = dg.Embedding(
+ self.full_name(),
+ size=(n_position, d_pos_vec),
+ is_sparse=is_sparse,
+ is_distributed=is_distributed,
+ padding_idx=None,
+ param_attr=param_attr,
+ dtype=dtype)
+ self.set_weight(
+ position_encoding_init(
+ n_position,
+ d_pos_vec,
+ position_rate=position_rate,
+ sinusoidal=False).astype(dtype))
+
+ self._is_sparse = is_sparse
+ self._is_distributed = is_distributed
+ self._remote_prefetch = self._is_sparse and (not self._is_distributed)
+ if self._remote_prefetch:
+ assert self._is_sparse is True and self._is_distributed is False
+
+ self._padding_idx = (-1 if padding_idx is None else padding_idx if
+ padding_idx >= 0 else (n_position + padding_idx))
+ self._position_rate = position_rate
+ self._max_norm = max_norm
+ self._dtype = dtype
+
+ def set_weight(self, array):
+ assert self.embed._w.shape == list(array.shape), "shape does not match"
+ self.embed._w._ivar.value().get_tensor().set(
+ array, fluid.framework._current_expected_place())
+
+ def forward(self, indices, speaker_position_rate=None):
+ """
+ Args:
+ indices (Variable): Shape (B, T, 1), dtype: int64, position
+ indices, where B means the batch size, T means the time steps.
+            speaker_position_rate (Variable | float, optional): position
+                rate. It can be a floating point number or a Variable with
+                shape (1,), in which case the same rate is used for every
+                example. It can also be a Variable with shape (B, 1), which
+                contains a position rate for each speaker.
+ Returns:
+            out (Variable): Shape(B, T, C_pos), position embedding, where
+                C_pos means the position embedding size.
+ """
+ rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
+ batch_size = indices.shape[0]
+
+ if speaker_position_rate is None:
+ weight = compute_position_embedding(rad)
+ out = self._helper.create_variable_for_type_inference(self._dtype)
+ self._helper.append_op(
+ type="lookup_table",
+ inputs={"Ids": indices,
+ "W": weight},
+ outputs={"Out": out},
+ attrs={
+ "is_sparse": self._is_sparse,
+ "is_distributed": self._is_distributed,
+ "remote_prefetch": self._remote_prefetch,
+ "padding_idx":
+ self._padding_idx, # special value for lookup table op
+ })
+ return out
+
+ elif (np.isscalar(speaker_position_rate) or
+ isinstance(speaker_position_rate, fluid.framework.Variable) and
+ speaker_position_rate.shape == [1, 1]):
+            # make a weight
+ # scale the weight (the operand for sin & cos)
+ if np.isscalar(speaker_position_rate):
+ scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
+ else:
+ scaled_rad = fluid.layers.elementwise_mul(
+ rad, speaker_position_rate[0])
+ weight = compute_position_embedding(scaled_rad)
+ out = self._helper.create_variable_for_type_inference(self._dtype)
+ self._helper.append_op(
+ type="lookup_table",
+ inputs={"Ids": indices,
+ "W": weight},
+ outputs={"Out": out},
+ attrs={
+ "is_sparse": self._is_sparse,
+ "is_distributed": self._is_distributed,
+ "remote_prefetch": self._remote_prefetch,
+ "padding_idx":
+ self._padding_idx, # special value for lookup table op
+ })
+ return out
+
+ elif np.prod(speaker_position_rate.shape) > 1:
+ assert speaker_position_rate.shape == [batch_size, 1]
+ outputs = []
+ for i in range(batch_size):
+ rate = speaker_position_rate[i] # rate has shape [1]
+ scaled_rad = fluid.layers.elementwise_mul(rad, rate)
+ weight = compute_position_embedding(scaled_rad)
+ out = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ sequence = indices[i]
+ self._helper.append_op(
+ type="lookup_table",
+ inputs={"Ids": sequence,
+ "W": weight},
+ outputs={"Out": out},
+ attrs={
+ "is_sparse": self._is_sparse,
+ "is_distributed": self._is_distributed,
+ "remote_prefetch": self._remote_prefetch,
+ "padding_idx": -1,
+ })
+ outputs.append(out)
+ out = fluid.layers.stack(outputs)
+ return out
+ else:
+            raise ValueError("Unsupported shape of speaker_position_rate")
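+
+# Usage sketch (illustrative): looking up position embeddings for a batch,
+# inside a ``fluid.dygraph.guard()`` context:
+#
+#     pos_embed = PositionEmbedding("pos", n_position=1024, d_pos_vec=256)
+#     indices = dg.to_variable(
+#         np.arange(16).reshape(1, 16, 1).astype("int64"))
+#     out = pos_embed(indices, speaker_position_rate=1.0)  # (1, 16, 256)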
diff --git a/parakeet/modules/weight_norm.py b/parakeet/modules/weight_norm.py
new file mode 100644
index 0000000..cbb0d03
--- /dev/null
+++ b/parakeet/modules/weight_norm.py
@@ -0,0 +1,867 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy as np
+from six.moves import reduce
+
+from copy import deepcopy
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+from paddle.fluid import core
+from paddle.fluid.layers import utils
+from paddle.fluid.framework import Variable, Parameter
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+
+
+def _norm(p, dim):
+ """Computes the norm over all dimensions except dim.
+    It differs from the pytorch implementation in that it does not keep dim.
+    This difference is related to the broadcast mechanism in paddle.
+    Read elementwise_mul for more.
+ """
+
+ if dim is None:
+ return np.linalg.norm(p, ord=2, axis=None)
+ elif dim == 0:
+ p = np.reshape(p, newshape=(p.shape[0], -1))
+ return np.linalg.norm(p, ord=2, axis=1)
+ elif dim == p.ndim - 1:
+ p = np.reshape(p, newshape=(-1, p.shape[-1]))
+ return np.linalg.norm(p, ord=2, axis=0)
+ else:
+ perm = list(range(p.ndim))
+ perm[0] = dim
+ perm[dim] = 0
+        return _norm(np.transpose(p, axes=perm), 0)  # dim is now axis 0
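+
+# Worked example (illustrative): for a conv filter of shape (M, C, H, W),
+# ``_norm(w, 0)`` returns one L2 norm per output channel:
+#
+#     w = np.random.randn(4, 3, 5, 5).astype("float32")
+#     g = _norm(w, 0)  # shape (4,)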
+
+
+class FC(dg.Layer):
+ """
+ **Fully Connected Layer**
+
+ This function creates a fully connected layer in the network. It can take
+ one or multiple tensors as its inputs(input can be a list of Variable, see
+    Args in detail). It creates a pair of variables, magnitude (g) and
+    direction (V), for each input tensor; elementwise_mul(normalize(V), g)
+    forms the fully connected weight matrix from input units to output units.
+ The fully connected layer multiplies each input tensor
+ with its corresponding weight to produce an output Tensor with shape [M, `size`],
+ where M is batch size. If multiple input tensors are given, the results of
+ multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
+ is not None, a bias variable will be created and added to the output.
+ Finally, if activation is not None, it will be applied to the output as well.
+
+    When the input is a single tensor:
+
+ .. math::
+
+ Out = Act({X(normalize(V)g) + b})
+
+    When the input consists of multiple tensors:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b})
+
+ In the above equation:
+
+ * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
+ * :math:`X_i`: The i-th input tensor.
+ * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
+ * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+
+ See below for an example.
+
+ .. code-block:: text
+
+ Given:
+ data_1.data = [[[0.1, 0.2],
+ [0.3, 0.4]]]
+ data_1.shape = (1, 2, 2) # 1 is batch_size
+
+ data_2 = [[[0.1, 0.2, 0.3]]]
+ data_2.shape = (1, 1, 3)
+
+ out = fluid.layers.fc(input=[data_1, data_2], size=2)
+
+ Then:
+ out.data = [[0.18669507, 0.1893476]]
+ out.shape = (1, 2)
+
+ Args:
+ name_scope(str): The name of this class.
+ size(int): The number of output units in this layer.
+ num_flatten_dims (int): The fc layer can accept an input tensor with more than
+ two dimensions. If this happens, the multidimensional tensor will first be flattened
+ into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+ tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+ dimensions will be flatten to form the first dimension of the final matrix (height of
+ the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+ form the second dimension of the final matrix (width of the matrix). For example, suppose
+ `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+ Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
+ param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+ of this layer. If it is set to False, no bias will be added to the output units.
+ If it is set to None, the bias is initialized zero. Default: None.
+ act (str|None): Activation to be applied to the output of this layer.
+ is_test(bool): A flag indicating whether execution is in test phase. Default: False
+ dtype(str): Dtype used for weight
+
+ Raises:
+ ValueError: If rank of the input tensor is less than 2.
+
+ Examples:
+ .. code-block:: python
+
+ from paddle.fluid.dygraph.base import to_variable
+ import paddle.fluid as fluid
+ from paddle.fluid.dygraph import FC
+ import numpy as np
+
+ data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
+ with fluid.dygraph.guard():
+ fc = FC( "fc", 64, num_flatten_dims=2)
+ data = to_variable( data )
+ conv = fc( data )
+
+ """
+
+ def __init__(self,
+ name_scope,
+ size,
+ num_flatten_dims=1,
+ epsilon=1e-30,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ is_test=False,
+ dtype="float32"):
+ super(FC, self).__init__(name_scope, dtype)
+
+ self._size = size
+ self._num_flatten_dims = num_flatten_dims
+ self._epsilon = epsilon
+ self._dtype = dtype
+ self._param_attr = param_attr
+ self._bias_attr = bias_attr
+ self._act = act
+ self.__g = list()
+ self.__v = list()
+
+ @property
+ def _v(self, i=0):
+ return self.__v[i]
+
+ @property
+ def _g(self, i=0):
+ return self.__g[i]
+
+ @_v.setter
+ def _v(self, value, i=0):
+ assert isinstance(value, Parameter)
+ self.__v[i] = value
+
+ @_g.setter
+ def _g(self, value, i=0):
+ assert isinstance(value, Parameter)
+ self.__g[i] = value
+
+ def _build_once(self, input):
+ i = 0
+ for inp, param in self._helper.iter_inputs_and_params(input,
+ self._param_attr):
+ input_shape = inp.shape
+
+ param_shape = [
+ reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
+ 1)
+ ] + [self._size]
+ self.__v.append(
+ self.add_parameter(
+ "_v%d" % i,
+ self.create_parameter(
+ attr=param,
+ shape=param_shape,
+ dtype=self._dtype,
+ is_bias=False)))
+
+ magnitude_shape = param_shape[1:]
+ magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0)
+
+ self.__g.append(
+ self.add_parameter(
+ "_g%d" % i,
+ self.create_parameter(
+ attr=fluid.ParamAttr(
+ initializer=fluid.initializer.NumpyArrayInitializer(
+ magnitude_value)),
+ shape=magnitude_shape,
+ dtype=self._dtype,
+ is_bias=False)))
+ i += 1
+
+ size = list([self._size])
+ self._b = self.create_parameter(
+ attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
+
+ def forward(self, input):
+ mul_results = list()
+ i = 0
+ for inp, param in self._helper.iter_inputs_and_params(input,
+ self._param_attr):
+ v_norm = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ v_normalized = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="norm",
+ inputs={"X": self.__v[i]},
+ outputs={"Out": v_normalized,
+ "Norm": v_norm},
+ attrs={"axis": 0,
+ "epsilon": self._epsilon})
+ weight = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="elementwise_mul",
+ inputs={"X": [v_normalized],
+ "Y": [self.__g[i]]},
+ outputs={"Out": [weight]},
+ attrs={"axis": 1})
+ tmp = self._helper.create_variable_for_type_inference(self._dtype)
+ self._helper.append_op(
+ type="mul",
+ inputs={"X": inp,
+ "Y": weight},
+ outputs={"Out": tmp},
+ attrs={
+ "x_num_col_dims": self._num_flatten_dims,
+ "y_num_col_dims": 1
+ })
+ i += 1
+ mul_results.append(tmp)
+
+ if len(mul_results) == 1:
+ pre_bias = mul_results[0]
+ else:
+ pre_bias = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="sum",
+ inputs={"X": mul_results},
+ outputs={"Out": pre_bias},
+ attrs={"use_mkldnn": False})
+
+ if self._b:
+ pre_activation = self._helper.create_variable_for_type_inference(
+ dtype=self._dtype)
+ self._helper.append_op(
+ type="elementwise_add",
+ inputs={"X": [pre_bias],
+ "Y": [self._b]},
+ outputs={"Out": [pre_activation]},
+ attrs={"axis": self._num_flatten_dims})
+ else:
+ pre_activation = pre_bias
+ # Currently, we don't support inplace in dygraph mode
+ return self._helper.append_activation(pre_activation, act=self._act)
+
+
+class Conv2D(dg.Layer):
+ """
+ The convolution2D layer calculates the output based on the input, filter
+ and strides, paddings, dilations, groups parameters. Input and
+ Output are in NCHW format, where N is batch size, C is the number of
+ channels, H is the height of the feature, and W is the width of the feature.
+ Filter is in MCHW format, where M is the number of output image channels,
+ C is the number of input image channels, H is the height of the filter,
+ and W is the width of the filter. If the groups is greater than 1,
+ C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's convolution tutorial for more details.
+ If bias attribution and activation type are provided, bias is added to the
+ output of the convolution, and the corresponding activation function is
+ applied to the final result.
+
+ For each input :math:`X`, the equation is:
+
+ .. math::
+
+ Out = \sigma ((Vg) \\ast X + b)
+
+ Where:
+
+ * :math:`X`: Input value, a tensor with NCHW format.
+ * :math:`V`: Filter direction value, a tensor with MCHW format.
+ * :math:`g`: Filter magnitude value, a tensor with M format.
+ * :math:`\\ast`: Convolution operation.
+ * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+ * :math:`\\sigma`: Activation function.
+ * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+ Example:
+
+ - Input:
+
+ Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+ Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+ - Output:
+
+ Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+ Where
+
+ .. math::
+
+ H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+ W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+ Args:
+ name_scope(str) : The name for this class.
+ num_filters(int): The number of filter. It is as same as the output
+ image channel.
+ filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+ it must contain two integers, (filter_size_H, filter_size_W).
+ Otherwise, the filter will be a square.
+ stride (int|tuple): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: stride = 1.
+ padding (int|tuple): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: padding = 0.
+ dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: dilation = 1.
+ groups (int): The groups number of the Conv2d Layer. According to grouped
+ convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+ the first half of the filters is only connected to the first half
+ of the input channels, while the second half of the filters is only
+ connected to the second half of the input channels. Default: groups=1.
+ param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+ of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+ will create ParamAttr as param_attr. If the Initializer of the param_attr
+ is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+ and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+ bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+ If it is set to False, no bias will be added to the output units.
+ If it is set to None or one attribute of ParamAttr, conv2d
+ will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+ is not set, the bias is initialized zero. Default: None.
+ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True
+ act (str): Activation type, if it is set to None, activation is not appended.
+ Default: None
+
+ Raises:
+ ValueError: If the shapes of input, filter_size, stride, padding and
+ groups mismatch.
+
+ Examples:
+ .. code-block:: python
+
+ from paddle.fluid.dygraph.base import to_variable
+ import paddle.fluid as fluid
+ from paddle.fluid.dygraph import Conv2D
+ import numpy as np
+
+ data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
+ with fluid.dygraph.guard():
+ conv2d = Conv2D( "conv2d", 2, 3)
+ data = to_variable( data )
+ conv = conv2d( data )
+
+ """
+
+ def __init__(self,
+ name_scope,
+ num_filters,
+ filter_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=None,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ act=None,
+ epsilon=1e-30,
+ dtype="float32"):
+ assert param_attr is not False, "param_attr should not be False here."
+ super(Conv2D, self).__init__(name_scope, dtype)
+ self._groups = groups
+ self._stride = utils.convert_to_list(stride, 2, "stride")
+ self._padding = utils.convert_to_list(padding, 2, "padding")
+ self._dilation = utils.convert_to_list(dilation, 2, "dilation")
+ self._act = act
+ if not isinstance(use_cudnn, bool):
+ raise ValueError("use_cudnn should be True or False")
+ self._use_cudnn = use_cudnn
+ self._filter_size = filter_size
+ self._num_filters = num_filters
+ self._param_attr = param_attr
+ self._bias_attr = bias_attr
+ self._epsilon = epsilon
+ self._dtype = dtype
+ # if (self._num_channels == self._groups and
+ # num_filters % self._num_channels == 0 and not self._use_cudnn):
+ # self._l_type = 'depthwise_conv2d'
+ # else:
+ # TODO(jiabin): recover the usage of depthwise_conv2d when it's
+ # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
+ self._l_type = "conv2d"
+
+ def _build_once(self, input):
+ self._num_channels = input.shape[1]
+ if self._groups is None:
+ num_filter_channels = self._num_channels
+ else:
+ if self._num_channels % self._groups != 0:
+ raise ValueError("num_channels must be divisible by groups.")
+ num_filter_channels = self._num_channels // self._groups
+ filter_size = utils.convert_to_list(self._filter_size, 2, "filter_size")
+ filter_shape = [self._num_filters, int(num_filter_channels)
+ ] + filter_size
+
+ def _get_default_param_initializer():
+ filter_elem_num = filter_size[0] * filter_size[
+ 1] * self._num_channels
+ std = (2.0 / filter_elem_num)**0.5
+ return Normal(0.0, std, 0)
+
+ # weight_v
+ self._filter_param_v = self.create_parameter(
+ attr=self._param_attr,
+ shape=filter_shape,
+ dtype=self._dtype,
+ default_initializer=_get_default_param_initializer())
+
+ # weight_g
+ norm_value = _norm(
+ self._filter_param_v.numpy(), dim=0) # CAUTION: hard-code
+ self._filter_param_g = self.create_parameter(
+ attr=fluid.ParamAttr(
+ initializer=fluid.initializer.NumpyArrayInitializer(
+ norm_value)),
+ shape=norm_value.shape,
+ dtype=self._dtype,
+ default_initializer=_get_default_param_initializer())
+
+ if self._use_cudnn:
+ self.create_variable(
+ name="kCUDNNFwdAlgoCache",
+ persistable=True,
+ type=core.VarDesc.VarType.RAW)
+ self.create_variable(
+ name="kCUDNNBwdDataAlgoCache",
+ persistable=True,
+ type=core.VarDesc.VarType.RAW)
+ self.create_variable(
+ name="kCUDNNBwdFilterAlgoCache",
+ persistable=True,
+ type=core.VarDesc.VarType.RAW)
+
+ self._bias_param = self.create_parameter(
+ attr=self._bias_attr,
+ shape=[self._num_filters],
+ dtype=self._dtype,
+ is_bias=True)
+
+ def forward(self, input):
+ matrix = self._helper.create_variable_for_type_inference(self._dtype)
+ tmp = self._helper.create_variable_for_type_inference(self._dtype)
+ new_shape = [
+ self._filter_param_v.shape[0],
+ reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
+ ]
+
+ self._helper.append_op(
+ type="reshape2",
+ inputs={"X": self._filter_param_v},
+ attrs={"shape": new_shape},
+ outputs={"Out": matrix,
+ "XShape": tmp})
+
+ m_norm = self._helper.create_variable_for_type_inference(self._dtype)
+ m_normalized = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="norm",
+ inputs={"X": matrix},
+ outputs={"Out": m_normalized,
+ "Norm": m_norm},
+ attrs={"axis": 1,
+ "epsilon": self._epsilon})
+
+ v_normalized = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
+ self._helper.append_op(
+ type="reshape2",
+ inputs={"X": m_normalized},
+ attrs={"shape": self._filter_param_v.shape},
+ outputs={"Out": v_normalized,
+ "XShape": tmp2})
+
+ filter_param = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="elementwise_mul",
+ inputs={"X": [v_normalized],
+ "Y": [self._filter_param_g]},
+ outputs={"Out": [filter_param]},
+ attrs={"axis": 0}, # CAUTION: hard-code
+ )
+
+ pre_bias = self._helper.create_variable_for_type_inference(
+ dtype=self._dtype)
+
+ self._helper.append_op(
+ type=self._l_type,
+ inputs={"Input": input,
+ "Filter": filter_param},
+ outputs={"Output": pre_bias},
+ attrs={
+ "strides": self._stride,
+ "paddings": self._padding,
+ "dilations": self._dilation,
+ "groups": self._groups if self._groups else 1,
+ "use_cudnn": self._use_cudnn,
+ "use_mkldnn": False,
+ })
+
+ if self._bias_param is not None:
+ pre_act = self._helper.create_variable_for_type_inference(
+ dtype=self._dtype)
+ self._helper.append_op(
+ type="elementwise_add",
+ inputs={"X": [pre_bias],
+ "Y": [self._bias_param]},
+ outputs={"Out": [pre_act]},
+ attrs={"axis": 1})
+ else:
+ pre_act = pre_bias
+
+ # Currently, we don't support inplace in dygraph mode
+ return self._helper.append_activation(pre_act, act=self._act)
+
+
+class Conv2DTranspose(dg.Layer):
+ """
+ **Convlution2D transpose layer**
+
+ The convolution2D transpose layer calculates the output based on the input,
+ filter, and dilations, strides, paddings. Input(Input) and output(Output)
+ are in NCHW format. Where N is batch size, C is the number of channels,
+ H is the height of the feature, and W is the width of the feature.
+ Parameters(dilations, strides, paddings) are two elements. These two elements
+    represent height and width, respectively. For the details of the
+    convolution transpose layer, please refer to the following explanation
+    and the references therein.
+ If bias attribution and activation type are provided, bias is added to
+ the output of the convolution, and the corresponding activation function
+ is applied to the final result.
+
+ For each input :math:`X`, the equation is:
+
+ .. math::
+
+ Out = \sigma ((Vg) \\ast X + b)
+
+ Where:
+
+ * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`V`: Filter direction value, a tensor with MCHW format.
+    * :math:`g`: Filter magnitude value, a tensor with M format.
+ * :math:`\\ast`: Convolution operation.
+ * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+ * :math:`\\sigma`: Activation function.
+ * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+ Example:
+
+ - Input:
+
+ Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+ Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
+
+ - Output:
+
+ Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+ Where
+
+ .. math::
+
+ H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
+ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
+ H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
+ W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+
+ Args:
+ name_scope(str): The name of this class.
+ num_filters(int): The number of the filter. It is as same as the output
+ image channel.
+ output_size(int|tuple|None): The output image size. If output size is a
+ tuple, it must contain two integers, (image_H, image_W). None if use
+ filter_size, padding, and stride to calculate output_size.
+ if output_size and filter_size are specified at the same time, They
+ should follow the formula above. Default: None.
+ filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+ it must contain two integers, (filter_size_H, filter_size_W).
+ Otherwise, the filter will be a square. None if use output size to
+ calculate filter_size. Default: None.
+ padding(int|tuple): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: padding = 0.
+ stride(int|tuple): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: stride = 1.
+ dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: dilation = 1.
+ groups(int): The groups number of the Conv2d transpose layer. Inspired by
+ grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+ when group=2, the first half of the filters is only connected to the
+ first half of the input channels, while the second half of the
+ filters is only connected to the second half of the input channels.
+ Default: groups = 1.
+ param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+ of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
+ will create ParamAttr as param_attr. If the Initializer of the param_attr
+ is not set, the parameter is initialized with Xavier. Default: None.
+ bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
+ If it is set to False, no bias will be added to the output units.
+ If it is set to None or one attribute of ParamAttr, conv2d_transpose
+ will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+ is not set, the bias is initialized zero. Default: None.
+ use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True.
+ act (str): Activation type, if it is set to None, activation is not appended.
+ Default: None.
+
+ Returns:
+ Variable: The tensor variable storing the convolution transpose result.
+
+ Raises:
+ ValueError: If the shapes of input, filter_size, stride, padding and
+ groups mismatch.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ import numpy
+
+ with fluid.dygraph.guard():
+ data = numpy.random.random((3, 32, 32)).astype('float32')
+ conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
+ 'Conv2DTranspose', num_filters=2, filter_size=3)
+ ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
+
+ """
+
+ def __init__(self,
+ name_scope,
+ num_filters,
+ output_size=None,
+ filter_size=None,
+ padding=0,
+ stride=1,
+ dilation=1,
+ groups=None,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ epsilon=1e-30,
+ act=None,
+ dtype="float32"):
+ super(Conv2DTranspose, self).__init__(name_scope, dtype)
+ assert (param_attr is not False
+ ), "param_attr should not be False in conv2d_transpose."
+ self._param_attr = param_attr
+ self._bias_attr = bias_attr
+ self._groups = groups
+ self._num_filters = num_filters
+ self._use_cudnn = use_cudnn
+ self._padding = padding
+ self._stride = stride
+ self._dilation = dilation
+ self._filter_size = filter_size
+ self._output_size = output_size
+ self._op_type = "conv2d_transpose"
+ self._epsilon = epsilon
+
+ def _build_once(self, input):
+ input_channel = input.shape[1]
+ if (input_channel == self._groups and
+ self._num_filters == input_channel and not self._use_cudnn):
+ self._op_type = "depthwise_conv2d_transpose"
+
+ if not isinstance(input, Variable):
+ raise TypeError("Input of conv2d_transpose must be Variable")
+
+ self._padding = utils.convert_to_list(self._padding, 2, "padding")
+ self._stride = utils.convert_to_list(self._stride, 2, "stride")
+ self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")
+
+ if not isinstance(self._use_cudnn, bool):
+ raise ValueError("use_cudnn should be True or False")
+
+ if self._filter_size is None:
+ if self._output_size is None:
+ raise ValueError(
+ "output_size must be set when filter_size is None")
+ if isinstance(self._output_size, int):
+ self._output_size = [self._output_size, self._output_size]
+
+ h_in = input.shape[2]
+ w_in = input.shape[3]
+
+ filter_size_h = (self._output_size[0] -
+ (h_in - 1) * self._stride[0] + 2 * self._padding[0]
+ - 1) // self._dilation[0] + 1
+ filter_size_w = (self._output_size[1] -
+ (w_in - 1) * self._stride[1] + 2 * self._padding[1]
+ - 1) // self._dilation[1] + 1
+ self._filter_size = [filter_size_h, filter_size_w]
+ else:
+ self._filter_size = utils.convert_to_list(
+ self._filter_size, 2, "conv2d_transpose.filter_size")
+
+ if self._output_size is None:
+ self._output_size = []
+ elif isinstance(self._output_size, list) or isinstance(
+ self._output_size, int):
+ self._output_size = utils.convert_to_list(self._output_size, 2,
+ "output_size")
+ else:
+ raise ValueError("output_size should be list or int")
+ self._padding = utils.convert_to_list(self._padding, 2, "padding")
+ self._groups = 1 if self._groups is None else self._groups
+ filter_shape = [
+ input_channel,
+ self._num_filters // self._groups,
+ ] + self._filter_size
+
+ # img filter v (direction)
+ self._img_filter_v = self.create_parameter(
+ dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
+
+ # img filter g (magnitude)
+ img_filter_magnitude = _norm(
+ self._img_filter_v.numpy(), dim=0) # CAUTION: hard-code
+ self._img_filter_g = self.create_parameter(
+ dtype=input.dtype,
+ shape=img_filter_magnitude.shape,
+ attr=fluid.ParamAttr(
+ initializer=NumpyArrayInitializer(img_filter_magnitude)))
+
+ self._img_bias = self.create_parameter(
+ attr=self._bias_attr,
+ shape=[self._num_filters],
+ dtype=self._dtype,
+ is_bias=True)
+
+ def forward(self, input):
+ matrix = self._helper.create_variable_for_type_inference(self._dtype)
+ tmp = self._helper.create_variable_for_type_inference(self._dtype)
+ new_shape = [
+ self._img_filter_v.shape[0],
+ reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
+ ]
+
+ self._helper.append_op(
+ type="reshape2",
+ inputs={"X": self._img_filter_v},
+ attrs={"shape": new_shape},
+ outputs={"Out": matrix,
+ "XShape": tmp})
+
+ m_norm = self._helper.create_variable_for_type_inference(self._dtype)
+ m_normalized = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="norm",
+ inputs={"X": matrix},
+ outputs={"Out": m_normalized,
+ "Norm": m_norm},
+ attrs={"axis": 1,
+ "epsilon": self._epsilon})
+
+ v_normalized = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
+ self._helper.append_op(
+ type="reshape2",
+ inputs={"X": m_normalized},
+ attrs={"shape": self._img_filter_v.shape},
+ outputs={"Out": v_normalized,
+ "XShape": tmp2})
+
+ img_filter = self._helper.create_variable_for_type_inference(
+ self._dtype)
+ self._helper.append_op(
+ type="elementwise_mul",
+ inputs={"X": [v_normalized],
+ "Y": [self._img_filter_g]},
+ outputs={"Out": [img_filter]},
+ attrs={"axis": 0}, # CAUTION: hard-code
+ )
+
+ pre_bias = self._helper.create_variable_for_type_inference(
+ dtype=input.dtype)
+ self._helper.append_op(
+ type=self._op_type,
+ inputs={"Input": [input],
+ "Filter": [img_filter]},
+ outputs={"Output": pre_bias},
+ attrs={
+ "output_size": self._output_size,
+ "strides": self._stride,
+ "paddings": self._padding,
+ "dilations": self._dilation,
+ "groups": self._groups,
+ "use_cudnn": self._use_cudnn,
+ })
+
+ if self._img_bias is not None:
+ pre_act = self._helper.create_variable_for_type_inference(
+ dtype=self._dtype)
+ self._helper.append_op(
+ type="elementwise_add",
+ inputs={"X": [pre_bias],
+ "Y": [self._img_bias]},
+ outputs={"Out": [pre_act]},
+ attrs={"axis": 1})
+ else:
+ pre_act = pre_bias
+
+ out = self._helper.append_activation(pre_act)
+ return out