613 lines
20 KiB
Python
613 lines
20 KiB
Python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import paddle
|
|
from paddle import fluid
|
|
import paddle.fluid.dygraph as dg
|
|
|
|
import numpy as np
|
|
|
|
from . import conv
|
|
from . import weight_norm
|
|
|
|
|
|
def FC(name_scope,
       in_features,
       size,
       num_flatten_dims=1,
       relu=False,
       dropout=0.0,
       epsilon=1e-30,
       act=None,
       is_test=False,
       dtype="float32"):
    """
    Build a ``weight_norm.FC`` layer with dropout-aware weight initialization.

    Every weight is drawn from normal(0, std) with
    std = np.sqrt((1 - dropout) / in_feature); when ``relu`` is True the std
    is additionally multiplied by np.sqrt(2.0). The bias is initialized to 0.

    Args:
        name_scope (str): forwarded to ``weight_norm.FC``.
        in_features (int | iterable of int): fan-in value(s). A single int is
            treated as a one-element list; one weight ParamAttr is created
            per entry.
        size: forwarded to ``weight_norm.FC``.
        num_flatten_dims (int): forwarded to ``weight_norm.FC``.
        relu (bool): whether to scale the std by np.sqrt(2.0).
        dropout (float): dropout rate used only to compute the std.
        epsilon (float): accepted for interface compatibility; not used here.
        act: forwarded to ``weight_norm.FC``.
        is_test (bool): accepted for interface compatibility; not used here.
        dtype (str): forwarded to ``weight_norm.FC``.

    Returns:
        The constructed ``weight_norm.FC`` layer.
    """
    fan_ins = [in_features] if isinstance(in_features, int) else in_features
    relu_gain = np.sqrt(2.0)

    # one ParamAttr per input, each with its own fan-in dependent std
    weight_attrs = []
    for fan_in in fan_ins:
        std = np.sqrt((1 - dropout) / fan_in)
        if relu:
            std = std * relu_gain
        normal_init = fluid.initializer.NormalInitializer(scale=std)
        weight_attrs.append(fluid.ParamAttr(initializer=normal_init))

    zero_init = fluid.initializer.ConstantInitializer(0.0)
    bias_attr = fluid.ParamAttr(initializer=zero_init)

    return weight_norm.FC(name_scope,
                          size,
                          num_flatten_dims=num_flatten_dims,
                          param_attr=weight_attrs,
                          bias_attr=bias_attr,
                          act=act,
                          dtype=dtype)
|
|
|
|
|
|
def Conv1D(name_scope,
           in_channels,
           num_filters,
           filter_size=3,
           dilation=1,
           groups=None,
           causal=False,
           std_mul=1.0,
           dropout=0.0,
           use_cudnn=True,
           act=None,
           dtype="float32"):
    """
    Build a ``conv.Conv1D`` layer with dropout-aware weight initialization.

    The weight is drawn from
    normal(0, std=np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)))
    and the bias is initialized to 0.

    Args:
        name_scope (str): forwarded to ``conv.Conv1D``.
        in_channels (int): number of input channels; also used for the std.
        num_filters (int): forwarded to ``conv.Conv1D``.
        filter_size (int): kernel width; also used for the std.
        dilation (int): forwarded to ``conv.Conv1D``.
        groups: forwarded to ``conv.Conv1D``.
        causal (bool): forwarded to ``conv.Conv1D``.
        std_mul (float): multiplier applied to the variance of the weight.
        dropout (float): dropout rate used only to compute the std.
        use_cudnn (bool): forwarded to ``conv.Conv1D``.
        act: forwarded to ``conv.Conv1D``.
        dtype (str): forwarded to ``conv.Conv1D``.

    Returns:
        The constructed ``conv.Conv1D`` layer.
    """
    fan_in = filter_size * in_channels
    std = np.sqrt((std_mul * (1 - dropout)) / fan_in)

    weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(
            loc=0.0, scale=std))
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))

    extra_options = dict(
        groups=groups,
        causal=causal,
        param_attr=weight_attr,
        bias_attr=bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
    return conv.Conv1D(name_scope, in_channels, num_filters, filter_size,
                       dilation, **extra_options)
|
|
|
|
|
|
def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              std=0.01,
              dtype="float32"):
    """
    Build a ``dg.Embedding`` layer whose table is initialized from a normal
    distribution with the given std.

    Args:
        name_scope (str): forwarded to ``dg.Embedding``.
        num_embeddings (int): number of rows of the embedding table.
        embed_dim (int): number of columns of the embedding table.
        is_sparse (bool): forwarded to ``dg.Embedding``.
        is_distributed (bool): forwarded to ``dg.Embedding``.
        padding_idx (int, optional): forwarded to ``dg.Embedding``.
        std (float): scale of the normal initializer for the table.
        dtype (str): forwarded to ``dg.Embedding``.

    Returns:
        The constructed ``dg.Embedding`` layer.
    """
    # param attrs
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    # is_sparse / is_distributed used to be accepted and silently dropped;
    # forward them so the caller's choice actually takes effect.
    layer = dg.Embedding(
        name_scope, (num_embeddings, embed_dim),
        is_sparse=is_sparse,
        is_distributed=is_distributed,
        padding_idx=padding_idx,
        param_attr=weight_attr,
        dtype=dtype)
    return layer
|
|
|
|
|
|
class Conv1DGLU(dg.Layer):
    """
    A 1D convolution block with GLU activation. Dropout is applied to the
    input x before the convolution. When a speaker embedding is given it is
    passed through a softsign-activated 1x1 convolution and added to the
    content half of the convolution output. With ``residual`` enabled the
    input is added back and the sum is scaled by np.sqrt(0.5).
    """

    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)

        # speaker conditioning spec
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim

        # convolution spec
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        # weight init and dropout
        self.std_mul = std_mul
        self.dropout = dropout

        # the residual sum requires matching channel counts
        assert (not residual) or (in_channels == num_filters), (
            "this block uses residual connection"
            "the input_channes should equals num_filters")

        # twice the filters: one half is the content, the other the gate
        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            std_mul=std_mul,
            dropout=dropout,
            dtype=dtype)

        # the speaker projection only exists in the multi-speaker case
        if n_speakers > 1:
            assert speaker_dim is not None, (
                "speaker embed should not be null in multi-speaker case")
            self.fc = Conv1D(
                self.full_name(),
                speaker_dim,
                num_filters,
                filter_size=1,
                dilation=1,
                causal=False,
                act="softsign",
                dtype=dtype)

    def _glu_block(self, x, speaker_embed, conv_fn):
        """
        Shared body of ``forward`` and ``add_input``: dropout, convolution
        via ``conv_fn``, optional speaker bias, GLU, optional residual.
        """
        shortcut = x
        hidden = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        hidden = conv_fn(hidden)

        content, gate = fluid.layers.split(hidden, num_or_sections=2, dim=1)
        if speaker_embed is not None:
            speaker_bias = self.fc(speaker_embed)
            content = content + speaker_bias

        # glu
        out = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
        if not self.residual:
            return out
        return fluid.layers.scale(out + shortcut, np.sqrt(0.5))

    def forward(self, x, speaker_embed_bc1t=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
                layer, where B means batch_size, C_in means the input
                channels, T means input time steps.
            speaker_embed_bc1t (Variable, optional): Shape(B, C_sp, 1, T),
                expanded speaker embed, where C_sp means speaker embedding
                size. Note that when using residual connection, the
                Conv1DGLU does not change the number of channels, so out
                channels equals input channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU,
                where C_out means the output channels of Conv1DGLU.
        """
        return self._glu_block(x, speaker_embed_bc1t, self.conv)

    def add_input(self, x, speaker_embed_bc11=None):
        """
        Step-wise counterpart of ``forward``; it uses ``self.conv.add_input``
        instead of calling ``self.conv``.

        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)

        Outputs:
            out: shape(B, num_filters, 1, time_steps), where time_steps = 1
        """
        return self._glu_block(x, speaker_embed_bc11, self.conv.add_input)
|
|
|
|
|
|
def Conv1DTranspose(name_scope,
                    in_channels,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    groups=None,
                    std_mul=1.0,
                    dropout=0.0,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    """
    Build a ``conv.Conv1DTranspose`` layer with dropout-aware weight
    initialization.

    The weight is drawn from a normal distribution with
    std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)) and
    the bias is initialized to 0. All remaining arguments are forwarded to
    ``conv.Conv1DTranspose`` unchanged.

    Returns:
        The constructed ``conv.Conv1DTranspose`` layer.
    """
    fan_in = in_channels * filter_size
    std = np.sqrt(std_mul * (1 - dropout) / fan_in)

    weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(scale=std))
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(0.0))

    return conv.Conv1DTranspose(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        groups=groups,
        param_attr=weight_attr,
        bias_attr=bias_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
|
|
|
|
|
|
def compute_position_embedding(rad):
    """
    Turn a transposed table of radians into a position embedding table.

    Args:
        rad (Variable): transposed radians, shape(embed_dim, n_vocab).

    Returns:
        Variable: the table transposed with perm=[1, 0], where the entries
            taken from even indices of the first axis of ``rad`` went
            through sin and those from odd indices went through cos.
    """
    # unpacking also requires rad to be 2-D
    embed_dim, _ = rad.shape

    even_idx = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
    odd_idx = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))

    even_part = fluid.layers.gather(rad, even_idx)
    odd_part = fluid.layers.gather(rad, odd_idx)

    sin_part = fluid.layers.sin(even_part)
    cos_part = fluid.layers.cos(odd_part)

    # write the sin rows back first, then the cos rows on top of that result
    with_sines = fluid.layers.scatter(rad, even_idx, sin_part)
    merged = fluid.layers.scatter(with_sines, odd_idx, cos_part)
    return fluid.layers.transpose(merged, perm=[1, 0])
|
|
|
|
|
|
def position_encoding_init(n_position,
                           d_pos_vec,
                           position_rate=1.0,
                           sinusoidal=True):
    """
    Init the sinusoid position encoding table.

    Entry (pos, i) is built from the angle
    position_rate * pos / 10000 ** (2 * (i // 2) / d_pos_vec).
    Row 0 is kept as an all-zero vector for the padding position.

    Args:
        n_position (int): number of positions (rows of the table).
        d_pos_vec (int): size of each position vector (columns).
        position_rate (float): multiplier applied to every position.
        sinusoidal (bool): if True, rows 1.. hold sin of the angle in even
            columns and cos of the angle in odd columns; if False the raw
            angles are returned.

    Returns:
        np.ndarray: table of shape (n_position, d_pos_vec). An empty table
            (n_position <= 0) has shape (0, d_pos_vec).
    """
    positions = np.arange(n_position).reshape(-1, 1)
    dims = np.arange(d_pos_vec)

    # broadcast (n_position, 1) against (d_pos_vec,) instead of looping in
    # Python; the operation order matches the scalar formula above.
    denominators = np.power(10000, 2 * (dims // 2) / d_pos_vec)
    position_enc = (position_rate * positions) / denominators

    # keep idx 0 for padding token position encoding zero vector
    position_enc[:1] = 0.0

    if sinusoidal:
        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1

    return position_enc
|
|
|
|
|
|
class PositionEmbedding(dg.Layer):
    """
    Position embedding whose stored table holds the raw angles (the table is
    filled from ``position_encoding_init`` with ``sinusoidal=False``). sin
    and cos are applied in ``forward`` through ``compute_position_embedding``,
    which allows the angles to be rescaled by a speaker position rate first.
    """

    def __init__(self,
                 name_scope,
                 n_position,
                 d_pos_vec,
                 position_rate=1.0,
                 is_sparse=False,
                 is_distributed=False,
                 param_attr=None,
                 max_norm=None,
                 padding_idx=None,
                 dtype="float32"):
        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)

        # the inner embedding is always built with padding_idx=None; the
        # padding index is only passed as an attribute of the lookup op
        self.embed = dg.Embedding(
            self.full_name(),
            size=(n_position, d_pos_vec),
            is_sparse=is_sparse,
            is_distributed=is_distributed,
            padding_idx=None,
            param_attr=param_attr,
            dtype=dtype)

        angles = position_encoding_init(
            n_position,
            d_pos_vec,
            position_rate=position_rate,
            sinusoidal=False)
        self.set_weight(angles.astype(dtype))

        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        # None -> -1; a negative index counts from the end of the table
        if padding_idx is None:
            self._padding_idx = -1
        elif padding_idx >= 0:
            self._padding_idx = padding_idx
        else:
            self._padding_idx = n_position + padding_idx

        self._position_rate = position_rate
        self._max_norm = max_norm
        self._dtype = dtype

    def set_weight(self, array):
        """Overwrite the embedding table with ``array`` (shapes must match)."""
        assert self.embed._w.shape == list(array.shape), "shape does not match"
        tensor = self.embed._w._ivar.value().get_tensor()
        tensor.set(array, fluid.framework._current_expected_place())

    def _append_lookup(self, ids, table, out, padding_idx):
        """Append a ``lookup_table`` op reading ``table`` at ``ids`` into ``out``."""
        self._helper.append_op(
            type="lookup_table",
            inputs={"Ids": ids,
                    "W": table},
            outputs={"Out": out},
            attrs={
                "is_sparse": self._is_sparse,
                "is_distributed": self._is_distributed,
                "remote_prefetch": self._remote_prefetch,
                # special value for lookup table op
                "padding_idx": padding_idx,
            })

    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): Shape (B, T, 1), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional), position
                rate. It can be a scalar, or a Variable holding a single
                rate (the code checks for shape [1, 1]); that rate is then
                used for every example. It can also be a Variable with
                shape (B, 1), which contains a speaker position rate for
                each speaker.

        Returns:
            out (Variable): position embedding, with C_pos the position
                embedding size as the last dimension.
                NOTE(review): the previous docstring stated Shape(B, C_pos);
                with (B, T, 1) indices the result presumably is
                (B, T, C_pos) - confirm.

        Raises:
            Exception: if speaker_position_rate matches none of the
                supported forms above.
        """
        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
        batch_size = indices.shape[0]

        # no speaker rate: use the stored angles as they are
        if speaker_position_rate is None:
            weight = compute_position_embedding(rad)
            out = self._helper.create_variable_for_type_inference(self._dtype)
            self._append_lookup(indices, weight, out, self._padding_idx)
            return out

        # one rate shared by the whole batch
        shared_rate = (np.isscalar(speaker_position_rate) or
                       isinstance(speaker_position_rate,
                                  fluid.framework.Variable) and
                       speaker_position_rate.shape == [1, 1])
        if shared_rate:
            # scale the weight (the operand for sin & cos)
            if np.isscalar(speaker_position_rate):
                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
            else:
                scaled_rad = fluid.layers.elementwise_mul(
                    rad, speaker_position_rate[0])
            weight = compute_position_embedding(scaled_rad)
            out = self._helper.create_variable_for_type_inference(self._dtype)
            self._append_lookup(indices, weight, out, self._padding_idx)
            return out

        # one rate per example: build a table and look up example by example
        if np.prod(speaker_position_rate.shape) > 1:
            assert speaker_position_rate.shape == [batch_size, 1]
            per_example = []
            for b in range(batch_size):
                rate = speaker_position_rate[b]  # rate has shape [1]
                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
                weight = compute_position_embedding(scaled_rad)
                out = self._helper.create_variable_for_type_inference(
                    self._dtype)
                sequence = indices[b]
                # this branch always uses -1 rather than self._padding_idx
                self._append_lookup(sequence, weight, out, -1)
                per_example.append(out)
            return fluid.layers.stack(per_example)

        raise Exception("Then you can just use position rate at init")
|
|
|
|
|
|
class Conv1D_GU(dg.Layer):
    """
    A 1D convolution block with a gated unit (sigmoid(gate) * tanh(content)),
    an optional conditioner added to the convolution output through a 1x1
    convolution, a running skip connection and an optional residual
    connection. Both the skip and the residual sums are scaled by
    np.sqrt(0.5).
    """

    def __init__(self,
                 name_scope,
                 conditioner_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)

        self.conditioner_dim = conditioner_dim
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        # the residual sum requires matching channel counts
        assert (not residual) or (in_channels == num_filters), (
            "this block uses residual connection"
            "the input_channels should equals num_filters")

        # twice the filters: one half is the content, the other the gate
        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            dtype=dtype)

        # 1x1 convolution projecting the conditioner onto content and gate
        self.fc = Conv1D(
            self.full_name(),
            conditioner_dim,
            2 * num_filters,
            filter_size=1,
            dilation=1,
            causal=False,
            dtype=dtype)

    def _gated_block(self, x, skip, conditioner, conv_fn):
        """
        Shared body of ``forward`` and ``add_input``: convolution via
        ``conv_fn``, optional conditioner bias, gated unit, skip update and
        optional residual.
        """
        shortcut = x
        hidden = conv_fn(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            hidden += cond_bias

        content, gate = fluid.layers.split(hidden, num_or_sections=2, dim=1)

        # Gated Unit.
        out = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate),
                                           fluid.layers.tanh(content))

        # the first block starts the skip path, later blocks accumulate
        if skip is None:
            skip = out
        else:
            skip = fluid.layers.scale(skip + out, np.sqrt(0.5))

        if self.residual:
            out = fluid.layers.scale(shortcut + out, np.sqrt(0.5))

        return out, skip

    def forward(self, x, skip=None, conditioner=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU
                layer, where B means batch_size, C_in means the input
                channels, T means input time steps.
            skip (Variable): Shape(B, C_in, 1, T), skip connection.
            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
                conditioner, where C_con is conditioner hidden dim which
                equals the num of mel bands. Note that when using residual
                connection, the Conv1D_GU does not change the number of
                channels, so out channels equals input channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU,
                where C_out means the output channels of Conv1D_GU.
            skip (Variable): Shape(B, C_out, 1, T), skip connection.
        """
        return self._gated_block(x, skip, conditioner, self.conv)

    def add_input(self, x, skip=None, conditioner=None):
        """
        Step-wise counterpart of ``forward``; it uses ``self.conv.add_input``
        instead of calling ``self.conv``.

        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            skip: shape(B, num_filters, 1, time_steps), skip connection
            conditioner: shape(B, conditioner_dim, 1, time_steps)

        Outputs:
            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
            skip: skip connection, same shape as x
        """
        return self._gated_block(x, skip, conditioner, self.conv.add_input)
|
|
|
|
|
|
def Conv2DTranspose(name_scope,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    """
    Build a ``weight_norm.Conv2DTranspose`` layer whose weight is filled
    with the constant 1 / (filter_size[0] * filter_size[1]).

    Args:
        name_scope (str): forwarded to ``weight_norm.Conv2DTranspose``.
        num_filters (int): forwarded to ``weight_norm.Conv2DTranspose``.
        filter_size (sequence of int): kernel size; it is indexed at [0]
            and [1], so a bare int is not accepted here.
        padding, stride, dilation, use_cudnn, act, dtype: forwarded to
            ``weight_norm.Conv2DTranspose`` unchanged.

    Returns:
        The constructed ``weight_norm.Conv2DTranspose`` layer.
    """
    kernel_area = filter_size[0] * filter_size[1]
    # no bias_attr is passed, so the wrapped layer's own default applies
    weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.ConstantInitializer(1.0 / kernel_area))

    return weight_norm.Conv2DTranspose(
        name_scope,
        num_filters,
        filter_size=filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        param_attr=weight_attr,
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)
|