From de4c5d4f49946e6f69a5e40671c2f203eb2849b6 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 25 Nov 2019 03:40:52 +0000 Subject: [PATCH] add models & modules back --- parakeet/{ => models}/deepvoice3/README.md | 0 parakeet/{ => models}/deepvoice3/README_cn.md | 0 parakeet/{ => models}/deepvoice3/_ce.py | 0 .../deepvoice3/_images/model_architecture.png | Bin parakeet/{ => models}/deepvoice3/audio.py | 0 parakeet/{ => models}/deepvoice3/builder.py | 0 .../deepvoice3/compute_timestamp_ratio.py | 0 parakeet/{ => models}/deepvoice3/data.py | 0 .../{ => models}/deepvoice3/deepvoice3.py | 0 parakeet/{ => models}/deepvoice3/dry_run.py | 0 .../{ => models}/deepvoice3/eval_model.py | 0 parakeet/{ => models}/deepvoice3/hparams.py | 0 parakeet/{ => models}/deepvoice3/ljspeech.py | 0 .../{ => models}/deepvoice3/preprocess.py | 0 .../presets/deepvoice3_ljspeech.json | 0 parakeet/{ => models}/deepvoice3/synthesis.py | 0 parakeet/{ => models}/deepvoice3/train.py | 0 parakeet/{ => models}/deepvoice3/train.sh | 0 .../{ => models}/deepvoice3/train_model.py | 0 parakeet/modules/__init__.py | 0 parakeet/modules/conv.py | 222 +++++ parakeet/modules/loss.py | 158 ++++ parakeet/modules/modules.py | 458 ++++++++++ parakeet/modules/weight_norm.py | 863 ++++++++++++++++++ 24 files changed, 1701 insertions(+) rename parakeet/{ => models}/deepvoice3/README.md (100%) rename parakeet/{ => models}/deepvoice3/README_cn.md (100%) rename parakeet/{ => models}/deepvoice3/_ce.py (100%) rename parakeet/{ => models}/deepvoice3/_images/model_architecture.png (100%) rename parakeet/{ => models}/deepvoice3/audio.py (100%) rename parakeet/{ => models}/deepvoice3/builder.py (100%) rename parakeet/{ => models}/deepvoice3/compute_timestamp_ratio.py (100%) rename parakeet/{ => models}/deepvoice3/data.py (100%) rename parakeet/{ => models}/deepvoice3/deepvoice3.py (100%) rename parakeet/{ => models}/deepvoice3/dry_run.py (100%) rename parakeet/{ => models}/deepvoice3/eval_model.py (100%) rename parakeet/{ 
=> models}/deepvoice3/hparams.py (100%) rename parakeet/{ => models}/deepvoice3/ljspeech.py (100%) rename parakeet/{ => models}/deepvoice3/preprocess.py (100%) rename parakeet/{ => models}/deepvoice3/presets/deepvoice3_ljspeech.json (100%) rename parakeet/{ => models}/deepvoice3/synthesis.py (100%) rename parakeet/{ => models}/deepvoice3/train.py (100%) rename parakeet/{ => models}/deepvoice3/train.sh (100%) rename parakeet/{ => models}/deepvoice3/train_model.py (100%) create mode 100644 parakeet/modules/__init__.py create mode 100644 parakeet/modules/conv.py create mode 100644 parakeet/modules/loss.py create mode 100644 parakeet/modules/modules.py create mode 100644 parakeet/modules/weight_norm.py diff --git a/parakeet/deepvoice3/README.md b/parakeet/models/deepvoice3/README.md similarity index 100% rename from parakeet/deepvoice3/README.md rename to parakeet/models/deepvoice3/README.md diff --git a/parakeet/deepvoice3/README_cn.md b/parakeet/models/deepvoice3/README_cn.md similarity index 100% rename from parakeet/deepvoice3/README_cn.md rename to parakeet/models/deepvoice3/README_cn.md diff --git a/parakeet/deepvoice3/_ce.py b/parakeet/models/deepvoice3/_ce.py similarity index 100% rename from parakeet/deepvoice3/_ce.py rename to parakeet/models/deepvoice3/_ce.py diff --git a/parakeet/deepvoice3/_images/model_architecture.png b/parakeet/models/deepvoice3/_images/model_architecture.png similarity index 100% rename from parakeet/deepvoice3/_images/model_architecture.png rename to parakeet/models/deepvoice3/_images/model_architecture.png diff --git a/parakeet/deepvoice3/audio.py b/parakeet/models/deepvoice3/audio.py similarity index 100% rename from parakeet/deepvoice3/audio.py rename to parakeet/models/deepvoice3/audio.py diff --git a/parakeet/deepvoice3/builder.py b/parakeet/models/deepvoice3/builder.py similarity index 100% rename from parakeet/deepvoice3/builder.py rename to parakeet/models/deepvoice3/builder.py diff --git 
a/parakeet/deepvoice3/compute_timestamp_ratio.py b/parakeet/models/deepvoice3/compute_timestamp_ratio.py similarity index 100% rename from parakeet/deepvoice3/compute_timestamp_ratio.py rename to parakeet/models/deepvoice3/compute_timestamp_ratio.py diff --git a/parakeet/deepvoice3/data.py b/parakeet/models/deepvoice3/data.py similarity index 100% rename from parakeet/deepvoice3/data.py rename to parakeet/models/deepvoice3/data.py diff --git a/parakeet/deepvoice3/deepvoice3.py b/parakeet/models/deepvoice3/deepvoice3.py similarity index 100% rename from parakeet/deepvoice3/deepvoice3.py rename to parakeet/models/deepvoice3/deepvoice3.py diff --git a/parakeet/deepvoice3/dry_run.py b/parakeet/models/deepvoice3/dry_run.py similarity index 100% rename from parakeet/deepvoice3/dry_run.py rename to parakeet/models/deepvoice3/dry_run.py diff --git a/parakeet/deepvoice3/eval_model.py b/parakeet/models/deepvoice3/eval_model.py similarity index 100% rename from parakeet/deepvoice3/eval_model.py rename to parakeet/models/deepvoice3/eval_model.py diff --git a/parakeet/deepvoice3/hparams.py b/parakeet/models/deepvoice3/hparams.py similarity index 100% rename from parakeet/deepvoice3/hparams.py rename to parakeet/models/deepvoice3/hparams.py diff --git a/parakeet/deepvoice3/ljspeech.py b/parakeet/models/deepvoice3/ljspeech.py similarity index 100% rename from parakeet/deepvoice3/ljspeech.py rename to parakeet/models/deepvoice3/ljspeech.py diff --git a/parakeet/deepvoice3/preprocess.py b/parakeet/models/deepvoice3/preprocess.py similarity index 100% rename from parakeet/deepvoice3/preprocess.py rename to parakeet/models/deepvoice3/preprocess.py diff --git a/parakeet/deepvoice3/presets/deepvoice3_ljspeech.json b/parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json similarity index 100% rename from parakeet/deepvoice3/presets/deepvoice3_ljspeech.json rename to parakeet/models/deepvoice3/presets/deepvoice3_ljspeech.json diff --git a/parakeet/deepvoice3/synthesis.py 
b/parakeet/models/deepvoice3/synthesis.py similarity index 100% rename from parakeet/deepvoice3/synthesis.py rename to parakeet/models/deepvoice3/synthesis.py diff --git a/parakeet/deepvoice3/train.py b/parakeet/models/deepvoice3/train.py similarity index 100% rename from parakeet/deepvoice3/train.py rename to parakeet/models/deepvoice3/train.py diff --git a/parakeet/deepvoice3/train.sh b/parakeet/models/deepvoice3/train.sh similarity index 100% rename from parakeet/deepvoice3/train.sh rename to parakeet/models/deepvoice3/train.sh diff --git a/parakeet/deepvoice3/train_model.py b/parakeet/models/deepvoice3/train_model.py similarity index 100% rename from parakeet/deepvoice3/train_model.py rename to parakeet/models/deepvoice3/train_model.py diff --git a/parakeet/modules/__init__.py b/parakeet/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py new file mode 100644 index 0000000..34149be --- /dev/null +++ b/parakeet/modules/conv.py @@ -0,0 +1,222 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy as np + +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg + +from weight_norm import Conv2D, Conv2DTranspose + + +class Conv1D(dg.Layer): + """ + A convolution 1D block implemented with Conv2D. 
For simplicity and + to ensure the output has the same length as the input, it does not allow + stride > 1. + """ + + def __init__(self, + name_scope, + in_cahnnels, + num_filters, + filter_size=3, + dilation=1, + groups=None, + causal=False, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + dtype="float32"): + super(Conv1D, self).__init__(name_scope, dtype=dtype) + + if causal: + padding = dilation * (filter_size - 1) + else: + padding = (dilation * (filter_size - 1)) // 2 + + self.in_channels = in_cahnnels + self.num_filters = num_filters + self.filter_size = filter_size + self.dilation = dilation + self.causal = causal + self.padding = padding + self.act = act + + self.conv = Conv2D( + self.full_name(), + num_filters=num_filters, + filter_size=(1, filter_size), + stride=(1, 1), + dilation=(1, dilation), + padding=(0, padding), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels. + + Returns: + x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means + output channels (num_filters). + """ + x = self.conv(x) + if self.filter_size > 1: + if self.causal: + x = fluid.layers.slice( + x, axes=[3], starts=[0], ends=[-self.padding]) + elif self.filter_size % 2 == 0: + x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1]) + return x + + def start_new_sequence(self): + self.temp_weight = None + self.input_buffer = None + + def add_input(self, x): + """ + Add input for a time step and compute an output for a time step. + + Args: + x (Variable): Shape(B, C_in, 1, T), the input, where C_in means + input channels, and T = 1. + + Returns: + out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out + means output channels (num_filters), and T = 1. 
+ + """ + if self.temp_weight is None: + self.temp_weight = self._reshaped_weight() + + window_size = 1 + (self.filter_size - 1) * self.dilation + batch_size = x.shape[0] + in_channels = x.shape[1] + + if self.filter_size > 1: + if self.input_buffer is None: + self.input_buffer = fluid.layers.fill_constant( + [batch_size, in_channels, 1, window_size - 1], + dtype=x.dtype, + value=0.0) + else: + self.input_buffer = self.input_buffer[:, :, :, 1:] + self.input_buffer = fluid.layers.concat( + [self.input_buffer, x], axis=3) + x = self.input_buffer + if self.dilation > 1: + if not hasattr(self, "indices"): + self.indices = dg.to_variable( + np.arange(0, window_size, self.dilation)) + tmp = fluid.layers.transpose( + self.input_buffer, perm=[3, 1, 2, 0]) + tmp = fluid.layers.gather(tmp, index=self.indices) + tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0]) + x = tmp + inputs = fluid.layers.reshape( + x, shape=[batch_size, in_channels * 1 * self.filter_size]) + out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True) + out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1) + out = fluid.layers.reshape(out, out.shape + [1, 1]) + out = self._helper.append_activation(out, act=self.act) + return out + + def _reshaped_weight(self): + """ + Get the linearized weight of convolution filter, cause it is by nature + a matmul weight. And because the model uses weight norm, compute the + weight by weight_v * weight_g to make it faster. 
+ + Returns: + weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size) + """ + shape = self.conv._filter_param_v.shape + matrix_shape = [shape[0], np.prod(shape[1:])] + weight_matrix = fluid.layers.reshape( + self.conv._filter_param_v, shape=matrix_shape) + weight_matrix = fluid.layers.elementwise_mul( + fluid.layers.l2_normalize( + weight_matrix, axis=1), + self.conv._filter_param_g, + axis=0) + return weight_matrix + + +class Conv1DTranspose(dg.Layer): + """ + A convolutional transpose 1D block implemented with convolutional transpose + 2D. It does not ensure that the output is exactly expanded stride times in + time dimension. + """ + + def __init__(self, + name_scope, + in_channels, + num_filters, + filter_size, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + dtype="float32"): + super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype) + + self.in_channels = in_channels + self.num_filters = num_filters + self.filter_size = filter_size + self.padding = padding + self.stride = stride + self.dilation = dilation + self.groups = groups + + self.conv_transpose = Conv2DTranspose( + self.full_name(), + num_filters, + filter_size=(1, filter_size), + padding=(0, padding), + stride=(1, stride), + dilation=(1, dilation), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + def forward(self, x): + """ + Args: + x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input + channels and T_in means the number of time steps of input. + + Returns: + out (Variable): shape(B, C_out, 1, T_out), where C_out means the + output channels and T_out means the number of time steps of + output. 
+ """ + return self.conv_transpose(x) diff --git a/parakeet/modules/loss.py b/parakeet/modules/loss.py new file mode 100644 index 0000000..96bcd3b --- /dev/null +++ b/parakeet/modules/loss.py @@ -0,0 +1,158 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from numba import jit + +from paddle import fluid +import paddle.fluid.dygraph as dg + + +def masked_mean(inputs, mask): + """ + Args: + inputs (Variable): Shape(B, C, 1, T), the input, where B means + batch size, C means channels of input, T means timesteps of + the input. + mask (Variable): Shape(B, T), a mask. + Returns: + loss (Variable): Shape(1, ), masked mean. 
+ """ + channels = inputs.shape[1] + reshaped_mask = fluid.layers.reshape( + mask, shape=[mask.shape[0], 1, 1, mask.shape[-1]]) + expanded_mask = fluid.layers.expand( + reshaped_mask, expand_times=[1, channels, 1, 1]) + expanded_mask.stop_gradient = True + + valid_cnt = fluid.layers.reduce_sum(expanded_mask) + valid_cnt.stop_gradient = True + + masked_inputs = inputs * expanded_mask + loss = fluid.layers.reduce_sum(masked_inputs) / valid_cnt + return loss + + +@jit(nopython=True) +def guided_attention(N, max_N, T, max_T, g): + W = np.zeros((max_N, max_T), dtype=np.float32) + for n in range(N): + for t in range(T): + W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g)) + return W + + +def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2): + B = len(input_lengths) + max_input_len = input_lengths.max() + W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32) + for b in range(B): + W[b] = guided_attention(input_lengths[b], max_input_len, + target_lengths[b], max_target_len, g).T + return W + + +class TTSLoss(object): + def __init__(self, + masked_weight=0.0, + priority_weight=0.0, + binary_divergence_weight=0.0, + guided_attention_sigma=0.2): + self.masked_weight = masked_weight + self.priority_weight = priority_weight + self.binary_divergence_weight = binary_divergence_weight + self.guided_attention_sigma = guided_attention_sigma + + def l1_loss(self, prediction, target, mask, priority_bin=None): + abs_diff = fluid.layers.abs(prediction - target) + + # basic mask-weighted l1 loss + w = self.masked_weight + if w > 0 and mask is not None: + base_l1_loss = w * masked_mean(abs_diff, mask) + ( + 1 - w) * fluid.layers.reduce_mean(abs_diff) + else: + base_l1_loss = fluid.layers.reduce_mean(abs_diff) + + if self.priority_weight > 0 and priority_bin is not None: + # mask-weighted priority channels' l1-loss + priority_abs_diff = fluid.layers.slice( + abs_diff, axes=[1], starts=[0], ends=[priority_bin]) + if w > 0 and mask is not None: + 
priority_loss = w * masked_mean(priority_abs_diff, mask) + ( + 1 - w) * fluid.layers.reduce_mean(priority_abs_diff) + else: + priority_loss = fluid.layers.reduce_mean(priority_abs_diff) + + # priority weighted sum + p = self.priority_weight + loss = p * priority_loss + (1 - p) * base_l1_loss + else: + loss = base_l1_loss + return loss + + def binary_divergence(self, prediction, target, mask): + flattened_prediction = fluid.layers.reshape(prediction, [-1, 1]) + flattened_target = fluid.layers.reshape(target, [-1, 1]) + flattened_loss = fluid.layers.log_loss( + flattened_prediction, flattened_target, epsilon=1e-8) + bin_div = fluid.layers.reshape(flattened_loss, prediction.shape) + + w = self.masked_weight + if w > 0 and mask is not None: + loss = w * masked_mean(bin_div, mask) + ( + 1 - w) * fluid.layers.reduce_mean(bin_div) + else: + loss = fluid.layers.reduce_mean(bin_div) + return loss + + @staticmethod + def done_loss(done_hat, done): + flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1]) + flat_done = fluid.layers.reshape(done, [-1, 1]) + loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8) + loss = fluid.layers.reduce_mean(loss) + return loss + + def attention_loss(self, predicted_attention, input_lengths, + target_lengths): + """ + Given valid encoder_lengths and decoder_lengths, compute a diagonal + guide, and compute loss from the predicted attention and the guide. + + Args: + predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the + alignment tensor, where B means batch size, T_dec means number + of time steps of the decoder, T_enc means the number of time + steps of the encoder, * means other possible dimensions. + input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths + (time steps) of encoder outputs. + target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64, + valid lengths (time steps) of decoder outputs. + + Returns: + loss (Variable): Shape(1, ) attention loss. 
+ """ + n_attention, batch_size, max_target_len, max_input_len = ( + predicted_attention.shape) + soft_mask = guided_attentions(input_lengths, target_lengths, + max_target_len, + self.guided_attention_sigma) + soft_mask_ = dg.to_variable(soft_mask) + loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_) + return loss diff --git a/parakeet/modules/modules.py b/parakeet/modules/modules.py new file mode 100644 index 0000000..3ae95d7 --- /dev/null +++ b/parakeet/modules/modules.py @@ -0,0 +1,458 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg + +import numpy as np + +import conv +import weight_norm as weight_norm + + +def FC(name_scope, + in_features, + size, + num_flatten_dims=1, + dropout=0.0, + epsilon=1e-30, + act=None, + is_test=False, + dtype="float32"): + """ + A special Linear Layer, when it is used with dropout, the weight is + initialized as normal(0, std=np.sqrt((1-dropout) / in_features)) + """ + + # stds + if isinstance(in_features, int): + in_features = [in_features] + stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features] + weight_inits = [ + fluid.initializer.NormalInitializer(scale=std) for std in stds + ] + bias_init = fluid.initializer.ConstantInitializer(0.0) + + # param attrs + weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits] + bias_attr = fluid.ParamAttr(initializer=bias_init) + + layer = weight_norm.FC(name_scope, + size, + num_flatten_dims=num_flatten_dims, + param_attr=weight_attrs, + bias_attr=bias_attr, + act=act, + dtype=dtype) + return layer + + +def Conv1D(name_scope, + in_channels, + num_filters, + filter_size=3, + dilation=1, + groups=None, + causal=False, + std_mul=1.0, + dropout=0.0, + use_cudnn=True, + act=None, + dtype="float32"): + """ + A special Conv1D Layer, when it is used with dropout, the weight is + initialized as + normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features))) + """ + # std + std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels)) + weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std) + bias_init = fluid.initializer.ConstantInitializer(0.0) + + # param attrs + weight_attr = fluid.ParamAttr(initializer=weight_init) + bias_attr = fluid.ParamAttr(initializer=bias_init) + + layer = conv.Conv1D( + name_scope, + in_channels, + num_filters, + filter_size, + dilation, + groups=groups, + causal=causal, + param_attr=weight_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, 
+ dtype=dtype) + return layer + + +def Embedding(name_scope, + num_embeddings, + embed_dim, + is_sparse=False, + is_distributed=False, + padding_idx=None, + std=0.01, + dtype="float32"): + # param attrs + weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=std)) + layer = dg.Embedding( + name_scope, (num_embeddings, embed_dim), + padding_idx=padding_idx, + param_attr=weight_attr, + dtype=dtype) + return layer + + +class Conv1DGLU(dg.Layer): + """ + A Convolution 1D block with GLU activation. It also applies dropout for the + input x. It fuses speaker embeddings through a FC activated by softsign. It + has a residual connection from the input x, and scales the output by + np.sqrt(0.5). + """ + + def __init__(self, + name_scope, + n_speakers, + speaker_dim, + in_channels, + num_filters, + filter_size, + dilation, + std_mul=4.0, + dropout=0.0, + causal=False, + residual=True, + dtype="float32"): + super(Conv1DGLU, self).__init__(name_scope, dtype=dtype) + + # conv spec + self.in_channels = in_channels + self.n_speakers = n_speakers + self.speaker_dim = speaker_dim + self.num_filters = num_filters + self.filter_size = filter_size + self.dilation = dilation + self.causal = causal + self.residual = residual + + # weight init and dropout + self.std_mul = std_mul + self.dropout = dropout + + if residual: + assert ( + in_channels == num_filters + ), "this block uses residual connection"\ + "the input_channes should equals num_filters" + + self.conv = Conv1D( + self.full_name(), + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal=causal, + std_mul=std_mul, + dropout=dropout, + dtype=dtype) + + if n_speakers > 1: + assert (speaker_dim is not None + ), "speaker embed should not be null in multi-speaker case" + self.fc = Conv1D( + self.full_name(), + speaker_dim, + num_filters, + filter_size=1, + dilation=1, + causal=False, + act="softsign", + dtype=dtype) + + def forward(self, x, speaker_embed_bc1t=None): + """ + Args: + x (Variable): 
Shape(B, C_in, 1, T), the input of Conv1DGLU + layer, where B means batch_size, C_in means the input channels, + T means input time steps. + speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded + speaker embed, where C_sp means speaker embedding size. Note + that when using residual connection, the Conv1DGLU does not + change the number of channels, so out channels equals input + channels. + + Returns: + x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where + C_out means the output channels of Conv1DGLU. + """ + + residual = x + x = fluid.layers.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") + x = self.conv(x) + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + if speaker_embed_bc1t is not None: + sp = self.fc(speaker_embed_bc1t) + content = content + sp + + # glu + x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) + + if self.residual: + x = fluid.layers.scale(x + residual, np.sqrt(0.5)) + return x + + def add_input(self, x, speaker_embed_bc11=None): + """ + Inputs: + x: shape(B, num_filters, 1, time_steps) + speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps) + + Outputs: + out: shape(B, num_filters, 1, time_steps), where time_steps = 1 + """ + + residual = x + + # add step input and produce step output + x = fluid.layers.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") + x = self.conv.add_input(x) + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + if speaker_embed_bc11 is not None: + sp = self.fc(speaker_embed_bc11) + content = content + sp + + x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) + + if self.residual: + x = fluid.layers.scale(x + residual, np.sqrt(0.5)) + return x + + +def Conv1DTranspose(name_scope, + in_channels, + num_filters, + filter_size, + padding=0, + stride=1, + dilation=1, + groups=None, + std_mul=1.0, + dropout=0.0, + use_cudnn=True, + act=None, + dtype="float32"): + std = 
np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)) + weight_init = fluid.initializer.NormalInitializer(scale=std) + weight_attr = fluid.ParamAttr(initializer=weight_init) + bias_init = fluid.initializer.ConstantInitializer(0.0) + bias_attr = fluid.ParamAttr(initializer=bias_init) + layer = conv.Conv1DTranspose( + name_scope, + in_channels, + num_filters, + filter_size, + padding=padding, + stride=stride, + dilation=dilation, + groups=groups, + param_attr=weight_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + return layer + + +def compute_position_embedding(rad): + # rad is a transposed radius, shape(embed_dim, n_vocab) + embed_dim, n_vocab = rad.shape + + even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32")) + odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32")) + + even_rads = fluid.layers.gather(rad, even_dims) + odd_rads = fluid.layers.gather(rad, odd_dims) + + sines = fluid.layers.sin(even_rads) + cosines = fluid.layers.cos(odd_rads) + + temp = fluid.layers.scatter(rad, even_dims, sines) + out = fluid.layers.scatter(temp, odd_dims, cosines) + out = fluid.layers.transpose(out, perm=[1, 0]) + return out + + +def position_encoding_init(n_position, + d_pos_vec, + position_rate=1.0, + sinusoidal=True): + """ Init the sinusoid position encoding table """ + + # keep idx 0 for padding token position encoding zero vector + position_enc = np.array([[ + position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) + for i in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + + if sinusoidal: + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + + return position_enc + + +class PositionEmbedding(dg.Layer): + def __init__(self, + name_scope, + n_position, + d_pos_vec, + position_rate=1.0, + is_sparse=False, + is_distributed=False, + param_attr=None, + max_norm=None, + 
padding_idx=None, + dtype="float32"): + super(PositionEmbedding, self).__init__(name_scope, dtype=dtype) + self.embed = dg.Embedding( + self.full_name(), + size=(n_position, d_pos_vec), + is_sparse=is_sparse, + is_distributed=is_distributed, + padding_idx=None, + param_attr=param_attr, + dtype=dtype) + self.set_weight( + position_encoding_init( + n_position, + d_pos_vec, + position_rate=position_rate, + sinusoidal=False).astype(dtype)) + + self._is_sparse = is_sparse + self._is_distributed = is_distributed + self._remote_prefetch = self._is_sparse and (not self._is_distributed) + if self._remote_prefetch: + assert self._is_sparse is True and self._is_distributed is False + + self._padding_idx = (-1 if padding_idx is None else padding_idx if + padding_idx >= 0 else (n_position + padding_idx)) + self._position_rate = position_rate + self._max_norm = max_norm + self._dtype = dtype + + def set_weight(self, array): + assert self.embed._w.shape == list(array.shape), "shape does not match" + self.embed._w._ivar.value().get_tensor().set( + array, fluid.framework._current_expected_place()) + + def forward(self, indices, speaker_position_rate=None): + """ + Args: + indices (Variable): Shape (B, T, 1), dtype: int64, position + indices, where B means the batch size, T means the time steps. + speaker_position_rate (Variable | float, optional), position + rate. It can be a float point number or a Variable with + shape (1,), then this speaker_position_rate is used for every + example. It can also be a Variable with shape (B, 1), which + contains a speaker position rate for each speaker. + Returns: + out (Variable): Shape(B, C_pos), position embedding, where C_pos + means position embedding size. 
+ """ + rad = fluid.layers.transpose(self.embed._w, perm=[1, 0]) + batch_size = indices.shape[0] + + if speaker_position_rate is None: + weight = compute_position_embedding(rad) + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="lookup_table", + inputs={"Ids": indices, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": + self._padding_idx, # special value for lookup table op + }) + return out + + elif (np.isscalar(speaker_position_rate) or + isinstance(speaker_position_rate, fluid.framework.Variable) and + speaker_position_rate.shape == [1, 1]): + # # make a weight + # scale the weight (the operand for sin & cos) + if np.isscalar(speaker_position_rate): + scaled_rad = fluid.layers.scale(rad, speaker_position_rate) + else: + scaled_rad = fluid.layers.elementwise_mul( + rad, speaker_position_rate[0]) + weight = compute_position_embedding(scaled_rad) + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="lookup_table", + inputs={"Ids": indices, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": + self._padding_idx, # special value for lookup table op + }) + return out + + elif np.prod(speaker_position_rate.shape) > 1: + assert speaker_position_rate.shape == [batch_size, 1] + outputs = [] + for i in range(batch_size): + rate = speaker_position_rate[i] # rate has shape [1] + scaled_rad = fluid.layers.elementwise_mul(rad, rate) + weight = compute_position_embedding(scaled_rad) + out = self._helper.create_variable_for_type_inference( + self._dtype) + sequence = indices[i] + self._helper.append_op( + type="lookup_table", + inputs={"Ids": sequence, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": 
self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": -1, + }) + outputs.append(out) + out = fluid.layers.stack(outputs) + return out + else: + raise Exception("Then you can just use position rate at init") diff --git a/parakeet/modules/weight_norm.py b/parakeet/modules/weight_norm.py new file mode 100644 index 0000000..cbb0d03 --- /dev/null +++ b/parakeet/modules/weight_norm.py @@ -0,0 +1,863 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy as np +from six.moves import reduce + +from copy import deepcopy + +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg +from paddle.fluid import core +from paddle.fluid.layers import utils +from paddle.fluid.framework import Variable +from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer + + +def _norm(p, dim): + """Computes the norm over all dimensions except dim. + It differs from the PyTorch implementation in that it does not keep dim. + This difference is related to the broadcast mechanism in paddle. + Read elementwise_mul for more. 
+ """ + + if dim is None: + return np.linalg.norm(p, ord=2, axis=None) + elif dim == 0: + p = np.reshape(p, newshape=(p.shape[0], -1)) + return np.linalg.norm(p, ord=2, axis=1) + elif dim == p.ndim - 1: + p = np.reshape(p, newshape=(-1, p.shape[-1])) + return np.linalg.norm(p, ord=2, axis=0) + else: + perm = list(range(p.ndim)) + perm[0] = dim + perm[dim] = 0 + return _norm(np.transpose(p, axes=perm)) + + +class FC(dg.Layer): + """ + **Fully Connected Layer** + + This function creates a fully connected layer in the network. It can take + one or multiple tensors as its inputs(input can be a list of Variable, see + Args in detail). It creates a pair of variables called (magnitude(g), + direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected + weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor + with its corresponding weight to produce an output Tensor with shape [M, `size`], + where M is batch size. If multiple input tensors are given, the results of + multiple output tensors with shape [M, `size`] will be summed up. If bias_attr + is not None, a bias variable will be created and added to the output. + Finally, if activation is not None, it will be applied to the output as well. + + When the input is single tensor: + + .. math:: + + Out = Act({X(normalize(V)g) + b}) + + When the input are multiple tensors: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b}) + + In the above equation: + + * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable. + * :math:`X_i`: The i-th input tensor. + * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor. + * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + See below for an example. + + .. 
class FC(dg.Layer):
    r"""
    **Fully Connected Layer** (with weight normalization)

    This layer creates a fully connected layer in the network. It can take
    one or multiple tensors as its inputs (input can be a list of Variable, see
    Args in detail). It creates a pair of variables called (magnitude(g),
    direction(V)) for each input tensor. Elementwise_mul(normalize(V), g)
    represents a fully connected weight matrix from each input unit to each
    output unit.
    The fully connected layer multiplies each input tensor
    with its corresponding weight to produce an output Tensor with shape [M, `size`],
    where M is batch size. If multiple input tensors are given, the results of
    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
    is not False, a bias variable will be created and added to the output.
    Finally, if activation is not None, it will be applied to the output as well.

    When the input is single tensor:

    .. math::

        Out = Act({X(normalize(V)g) + b})

    When the input are multiple tensors:

    .. math::

        Out = Act({\sum_{i=0}^{N-1}X_i(normalize(V_i)g_i) + b})

    In the above equation:

    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
    * :math:`X_i`: The i-th input tensor.
    * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
    * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
    * :math:`b`: The bias parameter created by this layer (if needed).
    * :math:`Act`: The activation function.
    * :math:`Out`: The output tensor.

    See below for an example.

    .. code-block:: text

        Given:
            data_1.data = [[[0.1, 0.2],
                           [0.3, 0.4]]]
            data_1.shape = (1, 2, 2) # 1 is batch_size

            data_2 = [[[0.1, 0.2, 0.3]]]
            data_2.shape = (1, 1, 3)

            out = fluid.layers.fc(input=[data_1, data_2], size=2)

        Then:
            out.data = [[0.18669507, 0.1893476]]
            out.shape = (1, 2)

    Args:
        name_scope(str): The name of this class.
        size(int): The number of output units in this layer.
        num_flatten_dims (int): The fc layer can accept an input tensor with more than
            two dimensions. If this happens, the multidimensional tensor will first be flattened
            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
            dimensions will be flatten to form the first dimension of the final matrix (height of
            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
            form the second dimension of the final matrix (width of the matrix). For example, suppose
            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
        epsilon (float): The ``epsilon`` attribute passed to the ``norm`` op that
            normalizes the direction matrix. Default: 1e-30.
        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
            parameters/weights of this layer. It is used for the direction matrix V;
            the magnitude g is always initialized to the column norms of V.
        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
            of this layer. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized zero. Default: None.
        act (str|None): Activation to be applied to the output of this layer.
        is_test(bool): A flag indicating whether execution is in test phase.
            It is accepted for interface compatibility and not used by this
            implementation. Default: False
        dtype(str): Dtype used for weight

    Raises:
        ValueError: If rank of the input tensor is less than 2 (carried over
            from the stock ``FC`` documentation; not checked explicitly here).

    Examples:
        .. code-block:: python

          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from parakeet.modules.weight_norm import FC
          import numpy as np

          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
          with fluid.dygraph.guard():
              fc = FC( "fc", 64, num_flatten_dims=2)
              data = to_variable( data )
              conv = fc( data )

    """

    def __init__(self,
                 name_scope,
                 size,
                 num_flatten_dims=1,
                 epsilon=1e-30,
                 param_attr=None,
                 bias_attr=None,
                 act=None,
                 is_test=False,
                 dtype="float32"):
        super(FC, self).__init__(name_scope, dtype)

        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._epsilon = epsilon
        self._dtype = dtype
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        # One (magnitude, direction) pair per input tensor; filled lazily in
        # _build_once when the input shapes are known.
        self.__g = []
        self.__v = []

    # The properties below expose the parameters of the first input only.
    # A property accessor can never receive an extra index argument.
    @property
    def _v(self):
        return self.__v[0]

    @property
    def _g(self):
        return self.__g[0]

    @_v.setter
    def _v(self, value):
        # `Parameter` is not imported at module level; reach it via fluid.
        assert isinstance(value, fluid.framework.Parameter)
        self.__v[0] = value

    @_g.setter
    def _g(self, value):
        assert isinstance(value, fluid.framework.Parameter)
        self.__g[0] = value

    def _build_once(self, input):
        """Create the direction/magnitude parameters and the bias.

        Args:
            input (Variable|list of Variable): the input(s) of the first call,
                used to infer the weight shapes.
        """
        inputs_and_attrs = self._helper.iter_inputs_and_params(
            input, self._param_attr)
        for i, (inp, param) in enumerate(inputs_and_attrs):
            input_shape = inp.shape

            # Everything after the first `num_flatten_dims` axes is flattened
            # into the input width of the weight matrix.
            param_shape = [
                reduce(lambda a, b: a * b,
                       input_shape[self._num_flatten_dims:], 1)
            ] + [self._size]
            self.__v.append(
                self.add_parameter(
                    "_v%d" % i,
                    self.create_parameter(
                        attr=param,
                        shape=param_shape,
                        dtype=self._dtype,
                        is_bias=False)))

            # g starts as the column norms of V, so normalize(V) * g == V at
            # initialization.
            magnitude_shape = param_shape[1:]
            magnitude_value = np.linalg.norm(
                self.__v[i].numpy(), ord=2, axis=0)

            self.__g.append(
                self.add_parameter(
                    "_g%d" % i,
                    self.create_parameter(
                        attr=fluid.ParamAttr(
                            initializer=fluid.initializer.
                            NumpyArrayInitializer(magnitude_value)),
                        shape=magnitude_shape,
                        dtype=self._dtype,
                        is_bias=False)))

        size = [self._size]
        self._b = self.create_parameter(
            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)

    def forward(self, input):
        """Apply the weight-normalized fully connected transform.

        Args:
            input (Variable|list of Variable): the input tensor(s).

        Returns:
            Variable: the result after the optional bias and activation.
        """
        mul_results = []
        inputs_and_attrs = self._helper.iter_inputs_and_params(
            input, self._param_attr)
        for i, (inp, _) in enumerate(inputs_and_attrs):
            # normalize(V): unit L2 norm along axis 0 (per output unit).
            v_norm = self._helper.create_variable_for_type_inference(
                self._dtype)
            v_normalized = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="norm",
                inputs={"X": self.__v[i]},
                outputs={"Out": v_normalized,
                         "Norm": v_norm},
                attrs={"axis": 0,
                       "epsilon": self._epsilon})
            # weight = normalize(V) * g, with g broadcast along axis 1.
            weight = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="elementwise_mul",
                inputs={"X": [v_normalized],
                        "Y": [self.__g[i]]},
                outputs={"Out": [weight]},
                attrs={"axis": 1})
            tmp = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="mul",
                inputs={"X": inp,
                        "Y": weight},
                outputs={"Out": tmp},
                attrs={
                    "x_num_col_dims": self._num_flatten_dims,
                    "y_num_col_dims": 1
                })
            mul_results.append(tmp)

        if len(mul_results) == 1:
            pre_bias = mul_results[0]
        else:
            pre_bias = self._helper.create_variable_for_type_inference(
                self._dtype)
            self._helper.append_op(
                type="sum",
                inputs={"X": mul_results},
                outputs={"Out": pre_bias},
                attrs={"use_mkldnn": False})

        # Test against None (as the conv layers in this module do) rather
        # than relying on the truthiness of a Variable.
        if self._b is not None:
            pre_activation = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._b]},
                outputs={"Out": [pre_activation]},
                attrs={"axis": self._num_flatten_dims})
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(pre_activation, act=self._act)
class Conv2D(dg.Layer):
    r"""
    The convolution2D layer calculates the output based on the input, filter
    and strides, paddings, dilations, groups parameters. Input and
    Output are in NCHW format, where N is batch size, C is the number of
    channels, H is the height of the feature, and W is the width of the feature.
    Filter is in MCHW format, where M is the number of output image channels,
    C is the number of input image channels, H is the height of the filter,
    and W is the width of the filter. If the groups is greater than 1,
    C will equal the number of input image channels divided by the groups.
    Please refer to UFLDL's convolution tutorial for more details.
    If bias attribution and activation type are provided, bias is added to the
    output of the convolution, and the corresponding activation function is
    applied to the final result.

    The filter is weight-normalized: it is stored as a direction tensor V and
    a magnitude vector g, and the effective filter is normalize(V) * g.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \sigma ((Vg) \ast X + b)

    Where:

    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter direction value, a tensor with MCHW format.
    * :math:`g`: Filter magnitude value, a tensor with M format.
    * :math:`\ast`: Convolution operation.
    * :math:`b`: Bias value, a 1-D tensor with shape [M].
    * :math:`\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

            H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\
            W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

    Args:
        name_scope(str) : The name for this class.
        num_filters(int): The number of filter. It is as same as the output
            image channel.
        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
        stride (int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
        padding (int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups (int): The groups number of the Conv2d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. Default: groups=1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
            and the :math:`std` is :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None
        epsilon (float): The ``epsilon`` attribute passed to the ``norm`` op that
            normalizes the filter direction. Default: 1e-30.
        dtype (str): Dtype used for the parameters. Default: "float32".

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch.

    Examples:
        .. code-block:: python

          from paddle.fluid.dygraph.base import to_variable
          import paddle.fluid as fluid
          from parakeet.modules.weight_norm import Conv2D
          import numpy as np

          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
          with fluid.dygraph.guard():
              conv2d = Conv2D( "conv2d", 2, 3)
              data = to_variable( data )
              conv = conv2d( data )

    """

    def __init__(self,
                 name_scope,
                 num_filters,
                 filter_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 epsilon=1e-30,
                 dtype="float32"):
        assert param_attr is not False, "param_attr should not be False here."
        super(Conv2D, self).__init__(name_scope, dtype)
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, "stride")
        self._padding = utils.convert_to_list(padding, 2, "padding")
        self._dilation = utils.convert_to_list(dilation, 2, "dilation")
        self._act = act
        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")
        self._use_cudnn = use_cudnn
        self._filter_size = filter_size
        self._num_filters = num_filters
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._epsilon = epsilon
        self._dtype = dtype
        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
        # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
        self._l_type = "conv2d"

    def _build_once(self, input):
        """Create the filter direction/magnitude parameters and the bias.

        Args:
            input (Variable): the input of the first call; its second axis
                gives the number of input channels.

        Raises:
            ValueError: if the number of input channels is not divisible by
                ``groups``.
        """
        self._num_channels = input.shape[1]
        if self._groups is None:
            num_filter_channels = self._num_channels
        else:
            if self._num_channels % self._groups != 0:
                raise ValueError("num_channels must be divisible by groups.")
            num_filter_channels = self._num_channels // self._groups
        filter_size = utils.convert_to_list(self._filter_size, 2,
                                            "filter_size")
        filter_shape = [self._num_filters, int(num_filter_channels)
                        ] + filter_size

        def _get_default_param_initializer():
            filter_elem_num = filter_size[0] * filter_size[
                1] * self._num_channels
            std = (2.0 / filter_elem_num)**0.5
            return Normal(0.0, std, 0)

        # weight_v (direction)
        self._filter_param_v = self.create_parameter(
            attr=self._param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            default_initializer=_get_default_param_initializer())

        # weight_g (magnitude): one norm per output filter, so that
        # normalize(V) * g == V at initialization. The explicit initializer
        # in `attr` fully determines the initial value.
        norm_value = _norm(
            self._filter_param_v.numpy(), dim=0)  # CAUTION: hard-code
        self._filter_param_g = self.create_parameter(
            attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    norm_value)),
            shape=norm_value.shape,
            dtype=self._dtype)

        if self._use_cudnn:
            self.create_variable(
                name="kCUDNNFwdAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self.create_variable(
                name="kCUDNNBwdDataAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self.create_variable(
                name="kCUDNNBwdFilterAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)

        self._bias_param = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        """Apply the weight-normalized 2-D convolution.

        Args:
            input (Variable): the input tensor.

        Returns:
            Variable: the result after the optional bias and activation.
        """
        # Flatten V to [M, -1] so each filter can be normalized as one row.
        matrix = self._helper.create_variable_for_type_inference(self._dtype)
        tmp = self._helper.create_variable_for_type_inference(self._dtype)
        new_shape = [
            self._filter_param_v.shape[0],
            reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
        ]

        self._helper.append_op(
            type="reshape2",
            inputs={"X": self._filter_param_v},
            attrs={"shape": new_shape},
            outputs={"Out": matrix,
                     "XShape": tmp})

        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
        m_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="norm",
            inputs={"X": matrix},
            outputs={"Out": m_normalized,
                     "Norm": m_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        # Restore the original filter shape.
        v_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="reshape2",
            inputs={"X": m_normalized},
            attrs={"shape": self._filter_param_v.shape},
            outputs={"Out": v_normalized,
                     "XShape": tmp2})

        # filter = normalize(V) * g, with g broadcast along axis 0.
        filter_param = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="elementwise_mul",
            inputs={"X": [v_normalized],
                    "Y": [self._filter_param_g]},
            outputs={"Out": [filter_param]},
            attrs={"axis": 0},  # CAUTION: hard-code
        )

        pre_bias = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)

        self._helper.append_op(
            type=self._l_type,
            inputs={"Input": input,
                    "Filter": filter_param},
            outputs={"Output": pre_bias},
            attrs={
                "strides": self._stride,
                "paddings": self._padding,
                "dilations": self._dilation,
                "groups": self._groups if self._groups else 1,
                "use_cudnn": self._use_cudnn,
                "use_mkldnn": False,
            })

        if self._bias_param is not None:
            pre_act = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._bias_param]},
                outputs={"Out": [pre_act]},
                attrs={"axis": 1})
        else:
            pre_act = pre_bias

        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(pre_act, act=self._act)
class Conv2DTranspose(dg.Layer):
    r"""
    **Convolution2D transpose layer**

    The convolution2D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input(Input) and output(Output)
    are in NCHW format. Where N is batch size, C is the number of channels,
    H is the height of the feature, and W is the width of the feature.
    Parameters(dilations, strides, paddings) are two elements. These two elements
    represent height and width, respectively.
    If bias attribution and activation type are provided, bias is added to
    the output of the convolution, and the corresponding activation function
    is applied to the final result.

    The filter is weight-normalized: it is stored as a direction tensor V and
    a magnitude vector g, and the effective filter is normalize(V) * g.

    For each input :math:`X`, the equation is:

    .. math::

        Out = \sigma ((Vg) \ast X + b)

    Where:

    * :math:`X`: Input value, a tensor with NCHW format.
    * :math:`V`: Filter direction value, a tensor with the filter shape below.
    * :math:`g`: Filter magnitude value, a 1-D tensor with one entry per
      slice along the first axis of the filter.
    * :math:`\ast`: Convolution operation.
    * :math:`b`: Bias value, a 1-D tensor with shape [num_filters].
    * :math:`\sigma`: Activation function.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.

    Example:

        - Input:

          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`

        - Output:

          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

        Where

        .. math::

           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\
           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\
           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\
           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )

    Args:
        name_scope(str): The name of this class.
        num_filters(int): The number of the filter. It is as same as the output
            image channel.
        output_size(int|tuple|list|None): The output image size. If output size is a
            tuple or list, it must contain two integers, (image_H, image_W). None if use
            filter_size, padding, and stride to calculate output_size.
            If output_size and filter_size are specified at the same time, they
            should follow the formula above. Default: None.
        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square. None if use output size to
            calculate filter_size. Default: None.
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups(int): The groups number of the Conv2d transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
            first half of the input channels, while the second half of the
            filters is only connected to the second half of the input channels.
            Default: groups = 1.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
            If it is set to False, no bias will be added to the output units.
            If it is set to None or one attribute of ParamAttr, conv2d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True.
        epsilon (float): The ``epsilon`` attribute passed to the ``norm`` op that
            normalizes the filter direction. Default: 1e-30.
        act (str): Activation type, if it is set to None, activation is not appended.
            Default: None.
        dtype (str): Dtype used for the bias and intermediate variables.
            Default: "float32".

    Returns:
        Variable: The tensor variable storing the convolution transpose result.

    Raises:
        ValueError: If the shapes of input, filter_size, stride, padding and
                    groups mismatch.

    Examples:
       .. code-block:: python

          import paddle.fluid as fluid
          import numpy
          from parakeet.modules.weight_norm import Conv2DTranspose

          with fluid.dygraph.guard():
              data = numpy.random.random((1, 3, 32, 32)).astype('float32')
              conv2DTranspose = Conv2DTranspose(
                    'Conv2DTranspose', num_filters=2, filter_size=3)
              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))

    """

    def __init__(self,
                 name_scope,
                 num_filters,
                 output_size=None,
                 filter_size=None,
                 padding=0,
                 stride=1,
                 dilation=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 epsilon=1e-30,
                 act=None,
                 dtype="float32"):
        super(Conv2DTranspose, self).__init__(name_scope, dtype)
        assert (param_attr is not False
                ), "param_attr should not be False in conv2d_transpose."
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._groups = groups
        self._num_filters = num_filters
        self._use_cudnn = use_cudnn
        self._padding = padding
        self._stride = stride
        self._dilation = dilation
        self._filter_size = filter_size
        self._output_size = output_size
        self._op_type = "conv2d_transpose"
        self._epsilon = epsilon
        # Keep the activation so forward() can apply it.
        self._act = act

    def _build_once(self, input):
        """Resolve the layer geometry and create the parameters.

        Args:
            input (Variable): the input of the first call, in NCHW layout.

        Raises:
            TypeError: if ``input`` is not a Variable.
            ValueError: if ``use_cudnn`` is not a bool, if both
                ``filter_size`` and ``output_size`` are None, or if
                ``output_size`` has an unsupported type.
        """
        # Validate the type before touching `input.shape`.
        if not isinstance(input, Variable):
            raise TypeError("Input of conv2d_transpose must be Variable")

        input_channel = input.shape[1]
        if (input_channel == self._groups and
                self._num_filters == input_channel and not self._use_cudnn):
            self._op_type = "depthwise_conv2d_transpose"

        self._padding = utils.convert_to_list(self._padding, 2, "padding")
        self._stride = utils.convert_to_list(self._stride, 2, "stride")
        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")

        if not isinstance(self._use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")

        if self._filter_size is None:
            # Derive the filter size from the requested output size.
            if self._output_size is None:
                raise ValueError(
                    "output_size must be set when filter_size is None")
            if isinstance(self._output_size, int):
                self._output_size = [self._output_size, self._output_size]

            h_in = input.shape[2]
            w_in = input.shape[3]

            filter_size_h = (self._output_size[0] -
                             (h_in - 1) * self._stride[0] +
                             2 * self._padding[0] - 1) // self._dilation[0] + 1
            filter_size_w = (self._output_size[1] -
                             (w_in - 1) * self._stride[1] +
                             2 * self._padding[1] - 1) // self._dilation[1] + 1
            self._filter_size = [filter_size_h, filter_size_w]
        else:
            self._filter_size = utils.convert_to_list(
                self._filter_size, 2, "conv2d_transpose.filter_size")

        if self._output_size is None:
            self._output_size = []
        elif isinstance(self._output_size, (list, tuple, int)):
            # Tuples are accepted as well, as the class documentation states.
            self._output_size = utils.convert_to_list(self._output_size, 2,
                                                      "output_size")
        else:
            raise ValueError("output_size should be list or int")

        self._groups = 1 if self._groups is None else self._groups
        filter_shape = [
            input_channel,
            self._num_filters // self._groups,
        ] + self._filter_size

        # img filter v (direction)
        self._img_filter_v = self.create_parameter(
            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)

        # img filter g (magnitude): norms over all axes except axis 0, so
        # that normalize(V) * g == V at initialization.
        img_filter_magnitude = _norm(
            self._img_filter_v.numpy(), dim=0)  # CAUTION: hard-code
        self._img_filter_g = self.create_parameter(
            dtype=input.dtype,
            shape=img_filter_magnitude.shape,
            attr=fluid.ParamAttr(
                initializer=NumpyArrayInitializer(img_filter_magnitude)))

        self._img_bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[self._num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        """Apply the weight-normalized transposed 2-D convolution.

        Args:
            input (Variable): the input tensor.

        Returns:
            Variable: the result after the optional bias and activation.
        """
        # Flatten V to [shape[0], -1] so each slice is normalized as one row.
        matrix = self._helper.create_variable_for_type_inference(self._dtype)
        tmp = self._helper.create_variable_for_type_inference(self._dtype)
        new_shape = [
            self._img_filter_v.shape[0],
            reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
        ]

        self._helper.append_op(
            type="reshape2",
            inputs={"X": self._img_filter_v},
            attrs={"shape": new_shape},
            outputs={"Out": matrix,
                     "XShape": tmp})

        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
        m_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="norm",
            inputs={"X": matrix},
            outputs={"Out": m_normalized,
                     "Norm": m_norm},
            attrs={"axis": 1,
                   "epsilon": self._epsilon})

        # Restore the original filter shape.
        v_normalized = self._helper.create_variable_for_type_inference(
            self._dtype)
        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="reshape2",
            inputs={"X": m_normalized},
            attrs={"shape": self._img_filter_v.shape},
            outputs={"Out": v_normalized,
                     "XShape": tmp2})

        # filter = normalize(V) * g, with g broadcast along axis 0.
        img_filter = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type="elementwise_mul",
            inputs={"X": [v_normalized],
                    "Y": [self._img_filter_g]},
            outputs={"Out": [img_filter]},
            attrs={"axis": 0},  # CAUTION: hard-code
        )

        pre_bias = self._helper.create_variable_for_type_inference(
            dtype=input.dtype)
        self._helper.append_op(
            type=self._op_type,
            inputs={"Input": [input],
                    "Filter": [img_filter]},
            outputs={"Output": pre_bias},
            attrs={
                "output_size": self._output_size,
                "strides": self._stride,
                "paddings": self._padding,
                "dilations": self._dilation,
                "groups": self._groups,
                "use_cudnn": self._use_cudnn,
            })

        if self._img_bias is not None:
            pre_act = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type="elementwise_add",
                inputs={"X": [pre_bias],
                        "Y": [self._img_bias]},
                outputs={"Out": [pre_act]},
                attrs={"axis": 1})
        else:
            pre_act = pre_bias

        # Apply the configured activation, as Conv2D and FC do.
        out = self._helper.append_activation(pre_act, act=self._act)
        return out