diff --git a/parakeet/modules/modules.py b/parakeet/modules/modules.py
deleted file mode 100644
index 7aef463..0000000
--- a/parakeet/modules/modules.py
+++ /dev/null
@@ -1,612 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-import numpy as np
-
-from . import conv
-from . import weight_norm
-
-
-def FC(name_scope,
-       in_features,
-       size,
-       num_flatten_dims=1,
-       relu=False,
-       dropout=0.0,
-       epsilon=1e-30,
-       act=None,
-       is_test=False,
-       dtype="float32"):
-    """
-    A special Linear Layer, when it is used with dropout, the weight is
-    initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
-    """
-
-    # stds
-    if isinstance(in_features, int):
-        in_features = [in_features]
-
-    stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
-    if relu:
-        stds = [std * np.sqrt(2.0) for std in stds]
-
-    weight_inits = [
-        fluid.initializer.NormalInitializer(scale=std) for std in stds
-    ]
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-
-    # param attrs
-    weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-
-    layer = weight_norm.FC(name_scope,
-                           size,
-                           num_flatten_dims=num_flatten_dims,
-                           param_attr=weight_attrs,
-                           bias_attr=bias_attr,
-                           act=act,
-                           dtype=dtype)
-    return layer
-
-
-def Conv1D(name_scope,
-           in_channels,
-           num_filters,
-           filter_size=3,
-           dilation=1,
-           groups=None,
-           causal=False,
-           std_mul=1.0,
-           dropout=0.0,
-           use_cudnn=True,
-           act=None,
-           dtype="float32"):
-    """
-    A special Conv1D Layer, when it is used with dropout, the weight is
-    initialized as
-    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features)))
-    """
-    # std
-    std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
-    weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-
-    # param attrs
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-
-    layer = conv.Conv1D(
-        name_scope,
-        in_channels,
-        num_filters,
-        filter_size,
-        dilation,
-        groups=groups,
-        causal=causal,
-        param_attr=weight_attr,
-        bias_attr=bias_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-    return layer
-
-
-def Embedding(name_scope,
-              num_embeddings,
-              embed_dim,
-              is_sparse=False,
-              is_distributed=False,
-              padding_idx=None,
-              std=0.01,
-              dtype="float32"):
-    # param attrs
-    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
-        scale=std))
-    layer = dg.Embedding(
-        name_scope, (num_embeddings, embed_dim),
-        padding_idx=padding_idx,
-        param_attr=weight_attr,
-        dtype=dtype)
-    return layer
-
-
-class Conv1DGLU(dg.Layer):
-    """
-    A Convolution 1D block with GLU activation. It also applys dropout for the
-    input x. It fuses speaker embeddings through a FC activated by softsign. It
-    has residual connection from the input x, and scale the output by
-    np.sqrt(0.5).
-    """
-
-    def __init__(self,
-                 name_scope,
-                 n_speakers,
-                 speaker_dim,
-                 in_channels,
-                 num_filters,
-                 filter_size,
-                 dilation,
-                 std_mul=4.0,
-                 dropout=0.0,
-                 causal=False,
-                 residual=True,
-                 dtype="float32"):
-        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
-
-        # conv spec
-        self.in_channels = in_channels
-        self.n_speakers = n_speakers
-        self.speaker_dim = speaker_dim
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.causal = causal
-        self.residual = residual
-
-        # weight init and dropout
-        self.std_mul = std_mul
-        self.dropout = dropout
-
-        if residual:
-            assert (
-                in_channels == num_filters
-            ), "this block uses residual connection"\
-                "the input_channes should equals num_filters"
-
-        self.conv = Conv1D(
-            self.full_name(),
-            in_channels,
-            2 * num_filters,
-            filter_size,
-            dilation,
-            causal=causal,
-            std_mul=std_mul,
-            dropout=dropout,
-            dtype=dtype)
-
-        if n_speakers > 1:
-            assert (speaker_dim is not None
-                    ), "speaker embed should not be null in multi-speaker case"
-            self.fc = Conv1D(
-                self.full_name(),
-                speaker_dim,
-                num_filters,
-                filter_size=1,
-                dilation=1,
-                causal=False,
-                act="softsign",
-                dtype=dtype)
-
-    def forward(self, x, speaker_embed_bc1t=None):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
-                layer, where B means batch_size, C_in means the input channels
-                T means input time steps.
-            speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded
-                speaker embed, where C_sp means speaker embedding size. Note
-                that when using residual connection, the Conv1DGLU does not
-                change the number of channels, so out channels equals input
-                channels.
-
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
-                C_out means the output channels of Conv1DGLU.
-        """
-
-        residual = x
-        x = fluid.layers.dropout(
-            x, self.dropout, dropout_implementation="upscale_in_train")
-        x = self.conv(x)
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        if speaker_embed_bc1t is not None:
-            sp = self.fc(speaker_embed_bc1t)
-            content = content + sp
-
-        # glu
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
-
-        if self.residual:
-            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
-        return x
-
-    def add_input(self, x, speaker_embed_bc11=None):
-        """
-        Inputs:
-            x: shape(B, num_filters, 1, time_steps)
-            speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)
-
-        Outputs:
-            out: shape(B, num_filters, 1, time_steps), where time_steps = 1
-        """
-
-        residual = x
-
-        # add step input and produce step output
-        x = fluid.layers.dropout(
-            x, self.dropout, dropout_implementation="upscale_in_train")
-        x = self.conv.add_input(x)
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        if speaker_embed_bc11 is not None:
-            sp = self.fc(speaker_embed_bc11)
-            content = content + sp
-
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
-
-        if self.residual:
-            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
-        return x
-
-
-def Conv1DTranspose(name_scope,
-                    in_channels,
-                    num_filters,
-                    filter_size,
-                    padding=0,
-                    stride=1,
-                    dilation=1,
-                    groups=None,
-                    std_mul=1.0,
-                    dropout=0.0,
-                    use_cudnn=True,
-                    act=None,
-                    dtype="float32"):
-    std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
-    weight_init = fluid.initializer.NormalInitializer(scale=std)
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-    layer = conv.Conv1DTranspose(
-        name_scope,
-        in_channels,
-        num_filters,
-        filter_size,
-        padding=padding,
-        stride=stride,
-        dilation=dilation,
-        groups=groups,
-        param_attr=weight_attr,
-        bias_attr=bias_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-    return layer
-
-
-def compute_position_embedding(rad):
-    # rad is a transposed radius, shape(embed_dim, n_vocab)
-    embed_dim, n_vocab = rad.shape
-
-    even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
-    odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
-
-    even_rads = fluid.layers.gather(rad, even_dims)
-    odd_rads = fluid.layers.gather(rad, odd_dims)
-
-    sines = fluid.layers.sin(even_rads)
-    cosines = fluid.layers.cos(odd_rads)
-
-    temp = fluid.layers.scatter(rad, even_dims, sines)
-    out = fluid.layers.scatter(temp, odd_dims, cosines)
-    out = fluid.layers.transpose(out, perm=[1, 0])
-    return out
-
-
-def position_encoding_init(n_position,
-                           d_pos_vec,
-                           position_rate=1.0,
-                           sinusoidal=True):
-    """ Init the sinusoid position encoding table """
-
-    # keep idx 0 for padding token position encoding zero vector
-    position_enc = np.array([[
-        position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
-        for i in range(d_pos_vec)
-    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
-
-    if sinusoidal:
-        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
-        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
-
-    return position_enc
-
-
-class PositionEmbedding(dg.Layer):
-    def __init__(self,
-                 name_scope,
-                 n_position,
-                 d_pos_vec,
-                 position_rate=1.0,
-                 is_sparse=False,
-                 is_distributed=False,
-                 param_attr=None,
-                 max_norm=None,
-                 padding_idx=None,
-                 dtype="float32"):
-        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
-
-        self.embed = dg.Embedding(
-            self.full_name(),
-            size=(n_position, d_pos_vec),
-            is_sparse=is_sparse,
-            is_distributed=is_distributed,
-            padding_idx=None,
-            param_attr=param_attr,
-            dtype=dtype)
-        self.set_weight(
-            position_encoding_init(
-                n_position,
-                d_pos_vec,
-                position_rate=position_rate,
-                sinusoidal=False).astype(dtype))
-
-        self._is_sparse = is_sparse
-        self._is_distributed = is_distributed
-        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
-        if self._remote_prefetch:
-            assert self._is_sparse is True and self._is_distributed is False
-
-        self._padding_idx = (-1 if padding_idx is None else padding_idx if
-                             padding_idx >= 0 else (n_position + padding_idx))
-        self._position_rate = position_rate
-        self._max_norm = max_norm
-        self._dtype = dtype
-
-    def set_weight(self, array):
-        assert self.embed._w.shape == list(array.shape), "shape does not match"
-        self.embed._w._ivar.value().get_tensor().set(
-            array, fluid.framework._current_expected_place())
-
-    def forward(self, indices, speaker_position_rate=None):
-        """
-        Args:
-            indices (Variable): Shape (B, T, 1), dtype: int64, position
-                indices, where B means the batch size, T means the time steps.
-            speaker_position_rate (Variable | float, optional), position
-                rate. It can be a float point number or a Variable with
-                shape (1,), then this speaker_position_rate is used for every
-                example. It can also be a Variable with shape (B, 1), which
-                contains a speaker position rate for each speaker.
-        Returns:
-            out (Variable): Shape(B, C_pos), position embedding, where C_pos
-                means position embedding size.
-        """
-        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
-        batch_size = indices.shape[0]
-
-        if speaker_position_rate is None:
-            weight = compute_position_embedding(rad)
-            out = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="lookup_table",
-                inputs={"Ids": indices,
-                        "W": weight},
-                outputs={"Out": out},
-                attrs={
-                    "is_sparse": self._is_sparse,
-                    "is_distributed": self._is_distributed,
-                    "remote_prefetch": self._remote_prefetch,
-                    "padding_idx":
-                    self._padding_idx,  # special value for lookup table op
-                })
-            return out
-
-        elif (np.isscalar(speaker_position_rate) or
-              isinstance(speaker_position_rate, fluid.framework.Variable) and
-              speaker_position_rate.shape == [1, 1]):
-            # # make a weight
-            # scale the weight (the operand for sin & cos)
-            if np.isscalar(speaker_position_rate):
-                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
-            else:
-                scaled_rad = fluid.layers.elementwise_mul(
-                    rad, speaker_position_rate[0])
-            weight = compute_position_embedding(scaled_rad)
-            out = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="lookup_table",
-                inputs={"Ids": indices,
-                        "W": weight},
-                outputs={"Out": out},
-                attrs={
-                    "is_sparse": self._is_sparse,
-                    "is_distributed": self._is_distributed,
-                    "remote_prefetch": self._remote_prefetch,
-                    "padding_idx":
-                    self._padding_idx,  # special value for lookup table op
-                })
-            return out
-
-        elif np.prod(speaker_position_rate.shape) > 1:
-            assert speaker_position_rate.shape == [batch_size, 1]
-            outputs = []
-            for i in range(batch_size):
-                rate = speaker_position_rate[i]  # rate has shape [1]
-                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
-                weight = compute_position_embedding(scaled_rad)
-                out = self._helper.create_variable_for_type_inference(
-                    self._dtype)
-                sequence = indices[i]
-                self._helper.append_op(
-                    type="lookup_table",
-                    inputs={"Ids": sequence,
-                            "W": weight},
-                    outputs={"Out": out},
-                    attrs={
-                        "is_sparse": self._is_sparse,
-                        "is_distributed": self._is_distributed,
-                        "remote_prefetch": self._remote_prefetch,
-                        "padding_idx": -1,
-                    })
-                outputs.append(out)
-            out = fluid.layers.stack(outputs)
-            return out
-        else:
-            raise Exception("Then you can just use position rate at init")
-
-
-class Conv1D_GU(dg.Layer):
-    def __init__(self,
-                 name_scope,
-                 conditioner_dim,
-                 in_channels,
-                 num_filters,
-                 filter_size,
-                 dilation,
-                 causal=False,
-                 residual=True,
-                 dtype="float32"):
-        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)
-
-        self.conditioner_dim = conditioner_dim
-        self.in_channels = in_channels
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.causal = causal
-        self.residual = residual
-
-        if residual:
-            assert (
-                in_channels == num_filters
-            ), "this block uses residual connection"\
-                "the input_channels should equals num_filters"
-
-        self.conv = Conv1D(
-            self.full_name(),
-            in_channels,
-            2 * num_filters,
-            filter_size,
-            dilation,
-            causal=causal,
-            dtype=dtype)
-
-        self.fc = Conv1D(
-            self.full_name(),
-            conditioner_dim,
-            2 * num_filters,
-            filter_size=1,
-            dilation=1,
-            causal=False,
-            dtype=dtype)
-
-    def forward(self, x, skip=None, conditioner=None):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU
-                layer, where B means batch_size, C_in means the input channels
-                T means input time steps.
-            skip (Variable): Shape(B, C_in, 1, T), skip connection.
-            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
-                conditioner, where C_con is conditioner hidden dim which
-                equals the num of mel bands. Note that when using residual
-                connection, the Conv1D_GU does not change the number of
-                channels, so out channels equals input channels.
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where
-                C_out means the output channels of Conv1D_GU.
-            skip (Variable): Shape(B, C_out, 1, T), skip connection.
-        """
-        residual = x
-        x = self.conv(x)
-
-        if conditioner is not None:
-            cond_bias = self.fc(conditioner)
-            x += cond_bias
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        # Gated Unit.
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
-                                         fluid.layers.tanh(content))
-
-        if skip is None:
-            skip = x
-        else:
-            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
-
-        if self.residual:
-            x = fluid.layers.scale(residual + x, np.sqrt(0.5))
-
-        return x, skip
-
-    def add_input(self, x, skip=None, conditioner=None):
-        """
-        Inputs:
-            x: shape(B, num_filters, 1, time_steps)
-            skip: shape(B, num_filters, 1, time_steps), skip connection
-            conditioner: shape(B, conditioner_dim, 1, time_steps)
-        Outputs:
-            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
-            skip: skip connection, same shape as x
-        """
-        residual = x
-
-        # add step input and produce step output
-        x = self.conv.add_input(x)
-
-        if conditioner is not None:
-            cond_bias = self.fc(conditioner)
-            x += cond_bias
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        # Gated Unit.
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
-                                         fluid.layers.tanh(content))
-
-        if skip is None:
-            skip = x
-        else:
-            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))
-
-        if self.residual:
-            x = fluid.layers.scale(residual + x, np.sqrt(0.5))
-
-        return x, skip
-
-
-def Conv2DTranspose(name_scope,
-                    num_filters,
-                    filter_size,
-                    padding=0,
-                    stride=1,
-                    dilation=1,
-                    use_cudnn=True,
-                    act=None,
-                    dtype="float32"):
-    val = 1.0 / (filter_size[0] * filter_size[1])
-    weight_init = fluid.initializer.ConstantInitializer(val)
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-
-    layer = weight_norm.Conv2DTranspose(
-        name_scope,
-        num_filters,
-        filter_size=filter_size,
-        padding=padding,
-        stride=stride,
-        dilation=dilation,
-        param_attr=weight_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-
-    return layer
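For context, a minimal sketch (not part of the diff) of how the removed Conv1DGLU block was typically driven under the old fluid dygraph API this file targets. The constructor arguments and the (B, C, 1, T) input layout follow the deleted docstrings; the batch size, channel count, and time length below are illustrative assumptions, and the snippet presumes the sibling conv/weight_norm modules and a Paddle 1.x install.

# Sketch only: exercises the Conv1DGLU block removed by this diff.
import numpy as np
import paddle.fluid.dygraph as dg

from parakeet.modules.modules import Conv1DGLU  # the module deleted above

with dg.guard():
    # Single-speaker case: no speaker-embedding FC is created.
    block = Conv1DGLU(
        "conv1dglu",
        n_speakers=1,
        speaker_dim=None,
        in_channels=128,
        num_filters=128,  # residual connection requires in_channels == num_filters
        filter_size=3,
        dilation=1,
        causal=True)
    # Input layout is (B, C_in, 1, T), as documented in the deleted forward().
    x = dg.to_variable(np.random.randn(4, 128, 1, 50).astype("float32"))
    y = block(x)  # (4, 128, 1, 50): channels preserved, residual sum scaled by sqrt(0.5)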