add a CLI for cloning examples
This commit is contained in:
parent
c7e5aaa540
commit
a01200e437
@@ -0,0 +1,36 @@
import parakeet

if __name__ == '__main__':
    import argparse
    import os
    import shutil
    from pathlib import Path

    package_path = Path(__file__).parent
    print(package_path)

    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers(dest="cmd")

    list_exp_parser = subparser.add_parser("list-examples")
    clone = subparser.add_parser("clone-example")
    clone.add_argument("experiment_name", type=str, help="experiment name")

    args = parser.parse_args()

    if args.cmd == "list-examples":
        print(os.listdir(package_path / "examples"))
        exit(0)

    if args.cmd == "clone-example":
        source = package_path / "examples" / (args.experiment_name)
        target = Path(os.getcwd()) / (args.experiment_name)
        if not os.path.exists(str(source)):
            raise ValueError("{} does not exist".format(str(source)))

        if os.path.exists(str(target)):
            raise FileExistsError("{} already exists".format(str(target)))

        shutil.copytree(str(source), str(target))
        print("{} copied!".format(args.experiment_name))
        exit(0)
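With this entry point, the two subcommands would typically be run as "python -m parakeet list-examples" and "python -m parakeet clone-example <experiment_name>" (this assumes the new file is the package's __main__.py; the file path itself is not shown on this page). For reference, a minimal sketch of the same clone logic as a reusable function, under that assumption:

# Hypothetical helper mirroring the "clone-example" branch above; a sketch,
# not part of this commit. It assumes parakeet ships an "examples" directory
# next to its package __init__.py, as the CLI code above does.
import shutil
from pathlib import Path

import parakeet


def clone_example(experiment_name, dest_dir="."):
    source = Path(parakeet.__file__).parent / "examples" / experiment_name
    target = Path(dest_dir) / experiment_name
    if not source.exists():
        raise ValueError("{} does not exist".format(source))
    if target.exists():
        raise FileExistsError("{} already exists".format(target))
    # copytree copies the whole example directory (configs, scripts, etc.)
    shutil.copytree(str(source), str(target))
    return target

The str() conversions mirror the CLI above and keep copytree compatible with older Python 3 versions that do not accept Path objects.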
@@ -1,15 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .transformer_tts import TransformerTTS
from .vocoder import Vocoder
@ -1,287 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.customized import Pool1D, Conv1D
|
||||
from parakeet.modules.dynamic_gru import DynamicGRU
|
||||
import numpy as np
|
||||
|
||||
|
||||
class CBHG(dg.Layer):
|
||||
def __init__(self,
|
||||
hidden_size,
|
||||
batch_size,
|
||||
K=16,
|
||||
projection_size=256,
|
||||
num_gru_layers=2,
|
||||
max_pool_kernel_size=2,
|
||||
is_post=False):
|
||||
"""CBHG Module
|
||||
|
||||
Args:
|
||||
hidden_size (int): dimension of hidden unit.
|
||||
batch_size (int): batch size of input.
|
||||
K (int, optional): number of convolution banks. Defaults to 16.
|
||||
projection_size (int, optional): dimension of projection unit. Defaults to 256.
|
||||
num_gru_layers (int, optional): number of layers of GRUcell. Defaults to 2.
|
||||
max_pool_kernel_size (int, optional): max pooling kernel size. Defaults to 2
|
||||
is_post (bool, optional): whether post processing or not. Defaults to False.
|
||||
"""
|
||||
super(CBHG, self).__init__()
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.projection_size = projection_size
|
||||
self.conv_list = []
|
||||
k = math.sqrt(1.0 / projection_size)
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=projection_size,
|
||||
num_filters=hidden_size,
|
||||
filter_size=1,
|
||||
padding=int(np.floor(1 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
k = math.sqrt(1.0 / hidden_size)
|
||||
for i in range(2, K + 1):
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=hidden_size,
|
||||
num_filters=hidden_size,
|
||||
filter_size=i,
|
||||
padding=int(np.floor(i / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
|
||||
for i, layer in enumerate(self.conv_list):
|
||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||
|
||||
self.batchnorm_list = []
|
||||
for i in range(K):
|
||||
self.batchnorm_list.append(
|
||||
dg.BatchNorm(
|
||||
hidden_size, data_layout='NCHW'))
|
||||
|
||||
for i, layer in enumerate(self.batchnorm_list):
|
||||
self.add_sublayer("batchnorm_list_{}".format(i), layer)
|
||||
|
||||
conv_outdim = hidden_size * K
|
||||
|
||||
k = math.sqrt(1.0 / conv_outdim)
|
||||
self.conv_projection_1 = Conv1D(
|
||||
num_channels=conv_outdim,
|
||||
num_filters=hidden_size,
|
||||
filter_size=3,
|
||||
padding=int(np.floor(3 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
k = math.sqrt(1.0 / hidden_size)
|
||||
self.conv_projection_2 = Conv1D(
|
||||
num_channels=hidden_size,
|
||||
num_filters=projection_size,
|
||||
filter_size=3,
|
||||
padding=int(np.floor(3 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
|
||||
self.batchnorm_proj_2 = dg.BatchNorm(
|
||||
projection_size, data_layout='NCHW')
|
||||
self.max_pool = Pool1D(
|
||||
pool_size=max_pool_kernel_size,
|
||||
pool_type='max',
|
||||
pool_stride=1,
|
||||
pool_padding=1,
|
||||
data_format="NCT")
|
||||
self.highway = Highwaynet(self.projection_size)
|
||||
|
||||
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
|
||||
h_0 = dg.to_variable(h_0)
|
||||
k = math.sqrt(1.0 / hidden_size)
|
||||
self.fc_forward1 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.fc_reverse1 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.gru_forward1 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=False,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
self.gru_reverse1 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=True,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
|
||||
self.fc_forward2 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.fc_reverse2 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.gru_forward2 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=False,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
self.gru_reverse2 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=True,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
|
||||
def _conv_fit_dim(self, x, filter_size=3):
|
||||
if filter_size % 2 == 0:
|
||||
return x[:, :, :-1]
|
||||
else:
|
||||
return x
|
||||
|
||||
def forward(self, input_):
|
||||
"""
|
||||
Convert linear spectrum to Mel spectrum.
|
||||
|
||||
Args:
|
||||
input_ (Variable): shape(B, C, T), dtype float32, the sequentially input.
|
||||
|
||||
Returns:
|
||||
out (Variable): shape(B, C, T), the CBHG output.
|
||||
"""
|
||||
|
||||
conv_list = []
|
||||
conv_input = input_
|
||||
|
||||
for i, (conv, batchnorm
|
||||
) in enumerate(zip(self.conv_list, self.batchnorm_list)):
|
||||
conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
|
||||
conv_input = layers.relu(batchnorm(conv_input))
|
||||
conv_list.append(conv_input)
|
||||
|
||||
conv_cat = layers.concat(conv_list, axis=1)
|
||||
conv_pool = self.max_pool(conv_cat)[:, :, :-1]
|
||||
|
||||
conv_proj = layers.relu(
|
||||
self.batchnorm_proj_1(
|
||||
self._conv_fit_dim(self.conv_projection_1(conv_pool))))
|
||||
conv_proj = self.batchnorm_proj_2(
|
||||
self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
|
||||
|
||||
# conv_proj.shape = [N, C, T]
|
||||
highway = layers.transpose(conv_proj, [0, 2, 1])
|
||||
highway = self.highway(highway)
|
||||
|
||||
# highway.shape = [N, T, C]
|
||||
fc_forward = self.fc_forward1(highway)
|
||||
fc_reverse = self.fc_reverse1(highway)
|
||||
out_forward = self.gru_forward1(fc_forward)
|
||||
out_reverse = self.gru_reverse1(fc_reverse)
|
||||
out = layers.concat([out_forward, out_reverse], axis=-1)
|
||||
fc_forward = self.fc_forward2(out)
|
||||
fc_reverse = self.fc_reverse2(out)
|
||||
out_forward = self.gru_forward2(fc_forward)
|
||||
out_reverse = self.gru_reverse2(fc_reverse)
|
||||
out = layers.concat([out_forward, out_reverse], axis=-1)
|
||||
out = layers.transpose(out, [0, 2, 1])
|
||||
return out
|
||||
|
||||
|
||||
class Highwaynet(dg.Layer):
|
||||
def __init__(self, num_units, num_layers=4):
|
||||
"""Highway network
|
||||
|
||||
Args:
|
||||
num_units (int): dimension of hidden unit.
|
||||
num_layers (int, optional): number of highway layers. Defaults to 4.
|
||||
"""
|
||||
super(Highwaynet, self).__init__()
|
||||
self.num_units = num_units
|
||||
self.num_layers = num_layers
|
||||
|
||||
self.gates = []
|
||||
self.linears = []
|
||||
k = math.sqrt(1.0 / num_units)
|
||||
for i in range(num_layers):
|
||||
self.linears.append(
|
||||
dg.Linear(
|
||||
num_units,
|
||||
num_units,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
self.gates.append(
|
||||
dg.Linear(
|
||||
num_units,
|
||||
num_units,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
|
||||
for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
|
||||
self.add_sublayer("linears_{}".format(i), linear)
|
||||
self.add_sublayer("gates_{}".format(i), gate)
|
||||
|
||||
def forward(self, input_):
|
||||
"""
|
||||
Compute result of Highway network.
|
||||
|
||||
Args:
|
||||
input_(Variable): shape(B, T, C), dtype float32, the sequentially input.
|
||||
|
||||
Returns:
|
||||
out(Variable): the Highway output.
|
||||
"""
|
||||
out = input_
|
||||
|
||||
for linear, gate in zip(self.linears, self.gates):
|
||||
h = fluid.layers.relu(linear(out))
|
||||
t_ = fluid.layers.sigmoid(gate(out))
|
||||
|
||||
c = 1 - t_
|
||||
out = h * t_ + out * c
|
||||
|
||||
return out
|
|
@ -1,193 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.ffn import PositionwiseFeedForward
|
||||
from parakeet.models.transformer_tts.prenet import PreNet
|
||||
from parakeet.models.transformer_tts.post_convnet import PostConvNet
|
||||
|
||||
|
||||
class Decoder(dg.Layer):
|
||||
def __init__(self,
|
||||
num_hidden,
|
||||
num_mels=80,
|
||||
outputs_per_step=1,
|
||||
num_head=4,
|
||||
n_layers=3):
|
||||
"""Decoder layer of TransformerTTS.
|
||||
|
||||
Args:
|
||||
num_hidden (int): the number of source vocabulary.
|
||||
n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
|
||||
outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
|
||||
num_head (int, optional): the head number of multihead attention. Defaults to 4.
|
||||
n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
|
||||
"""
|
||||
super(Decoder, self).__init__()
|
||||
self.num_hidden = num_hidden
|
||||
self.num_head = num_head
|
||||
param = fluid.ParamAttr()
|
||||
self.alpha = self.create_parameter(
|
||||
shape=(1, ),
|
||||
attr=param,
|
||||
dtype='float32',
|
||||
default_initializer=fluid.initializer.ConstantInitializer(
|
||||
value=1.0))
|
||||
self.pos_inp = get_sinusoid_encoding_table(
|
||||
1024, self.num_hidden, padding_idx=0)
|
||||
self.pos_emb = dg.Embedding(
|
||||
size=[1024, num_hidden],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||
self.pos_inp),
|
||||
trainable=False))
|
||||
self.decoder_prenet = PreNet(
|
||||
input_size=num_mels,
|
||||
hidden_size=num_hidden * 2,
|
||||
output_size=num_hidden,
|
||||
dropout_rate=0.2)
|
||||
k = math.sqrt(1.0 / num_hidden)
|
||||
self.linear = dg.Linear(
|
||||
num_hidden,
|
||||
num_hidden,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
self.selfattn_layers = [
|
||||
MultiheadAttention(num_hidden, num_hidden // num_head,
|
||||
num_hidden // num_head) for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.selfattn_layers):
|
||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||
self.attn_layers = [
|
||||
MultiheadAttention(num_hidden, num_hidden // num_head,
|
||||
num_hidden // num_head) for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.attn_layers):
|
||||
self.add_sublayer("attn_{}".format(i), layer)
|
||||
self.ffns = [
|
||||
PositionwiseFeedForward(
|
||||
num_hidden, num_hidden * num_head, filter_size=1)
|
||||
for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.ffns):
|
||||
self.add_sublayer("ffns_{}".format(i), layer)
|
||||
self.mel_linear = dg.Linear(
|
||||
num_hidden,
|
||||
num_mels * outputs_per_step,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.stop_linear = dg.Linear(
|
||||
num_hidden,
|
||||
1,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
self.postconvnet = PostConvNet(
|
||||
num_mels,
|
||||
num_hidden,
|
||||
filter_size=5,
|
||||
padding=4,
|
||||
num_conv=5,
|
||||
outputs_per_step=outputs_per_step,
|
||||
use_cudnn=True)
|
||||
|
||||
def forward(self, key, value, query, positional, c_mask):
|
||||
"""
|
||||
Compute decoder outputs.
|
||||
|
||||
Args:
|
||||
key (Variable): shape(B, T_text, C), dtype float32, the input key of decoder,
|
||||
where T_text means the timesteps of input text,
|
||||
value (Variable): shape(B, T_text, C), dtype float32, the input value of decoder.
|
||||
query (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder,
|
||||
where T_mel means the timesteps of input spectrum,
|
||||
positional (Variable): shape(B, T_mel), dtype int64, the spectrum position.
|
||||
c_mask (Variable): shape(B, T_text, 1), dtype float32, query mask returned from encoder.
|
||||
Returns:
|
||||
mel_out (Variable): shape(B, T_mel, C), the decoder output after mel linear projection.
|
||||
out (Variable): shape(B, T_mel, C), the decoder output after post mel network.
|
||||
stop_tokens (Variable): shape(B, T_mel, 1), the stop tokens of output.
|
||||
attn_list (list[Variable]): len(n_layers), the encoder-decoder attention list.
|
||||
selfattn_list (list[Variable]): len(n_layers), the decoder self attention list.
|
||||
"""
|
||||
|
||||
# get decoder mask with triangular matrix
|
||||
|
||||
if fluid.framework._dygraph_tracer()._train_mode:
|
||||
mask = get_dec_attn_key_pad_mask(positional, self.num_head,
|
||||
query.dtype)
|
||||
m_mask = get_non_pad_mask(positional, self.num_head, query.dtype)
|
||||
zero_mask = layers.cast(c_mask == 0, dtype=query.dtype) * -1e30
|
||||
zero_mask = layers.transpose(zero_mask, perm=[0, 2, 1])
|
||||
|
||||
else:
|
||||
len_q = query.shape[1]
|
||||
mask = layers.triu(
|
||||
layers.ones(
|
||||
shape=[len_q, len_q], dtype=query.dtype),
|
||||
diagonal=1)
|
||||
mask = layers.cast(mask != 0, dtype=query.dtype) * -1e30
|
||||
m_mask, zero_mask = None, None
|
||||
|
||||
# Decoder pre-network
|
||||
query = self.decoder_prenet(query)
|
||||
|
||||
# Centered position
|
||||
query = self.linear(query)
|
||||
|
||||
# Get position embedding
|
||||
positional = self.pos_emb(positional)
|
||||
query = positional * self.alpha + query
|
||||
|
||||
#positional dropout
|
||||
query = fluid.layers.dropout(
|
||||
query, 0.1, dropout_implementation='upscale_in_train')
|
||||
|
||||
# Attention decoder-decoder, encoder-decoder
|
||||
selfattn_list = list()
|
||||
attn_list = list()
|
||||
|
||||
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
|
||||
self.ffns):
|
||||
query, attn_dec = selfattn(
|
||||
query, query, query, mask=mask, query_mask=m_mask)
|
||||
query, attn_dot = attn(
|
||||
key, value, query, mask=zero_mask, query_mask=m_mask)
|
||||
query = ffn(query)
|
||||
selfattn_list.append(attn_dec)
|
||||
attn_list.append(attn_dot)
|
||||
|
||||
# Mel linear projection
|
||||
mel_out = self.mel_linear(query)
|
||||
# Post Mel Network
|
||||
out = self.postconvnet(mel_out)
|
||||
out = mel_out + out
|
||||
|
||||
# Stop tokens
|
||||
stop_tokens = self.stop_linear(query)
|
||||
stop_tokens = layers.squeeze(stop_tokens, [-1])
|
||||
stop_tokens = layers.sigmoid(stop_tokens)
|
||||
|
||||
return mel_out, out, attn_list, stop_tokens, selfattn_list
|
|
@ -1,106 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.ffn import PositionwiseFeedForward
|
||||
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
|
||||
|
||||
|
||||
class Encoder(dg.Layer):
|
||||
def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3):
|
||||
"""Encoder layer of TransformerTTS.
|
||||
|
||||
Args:
|
||||
embedding_size (int): the size of position embedding.
|
||||
num_hidden (int): the size of hidden layer in network.
|
||||
num_head (int, optional): the head number of multihead attention. Defaults to 4.
|
||||
n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
|
||||
"""
|
||||
super(Encoder, self).__init__()
|
||||
self.num_hidden = num_hidden
|
||||
self.num_head = num_head
|
||||
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
|
||||
value=1.0))
|
||||
self.alpha = self.create_parameter(
|
||||
shape=(1, ), attr=param, dtype='float32')
|
||||
self.pos_inp = get_sinusoid_encoding_table(
|
||||
1024, self.num_hidden, padding_idx=0)
|
||||
self.pos_emb = dg.Embedding(
|
||||
size=[1024, num_hidden],
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||
self.pos_inp),
|
||||
trainable=False))
|
||||
self.encoder_prenet = EncoderPrenet(
|
||||
embedding_size=embedding_size,
|
||||
num_hidden=num_hidden,
|
||||
use_cudnn=True)
|
||||
self.layers = [
|
||||
MultiheadAttention(num_hidden, num_hidden // num_head,
|
||||
num_hidden // num_head) for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.layers):
|
||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||
self.ffns = [
|
||||
PositionwiseFeedForward(
|
||||
num_hidden,
|
||||
num_hidden * num_head,
|
||||
filter_size=1,
|
||||
use_cudnn=True) for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.ffns):
|
||||
self.add_sublayer("ffns_{}".format(i), layer)
|
||||
|
||||
def forward(self, x, positional):
|
||||
"""
|
||||
Encode text sequence.
|
||||
|
||||
Args:
|
||||
x (Variable): shape(B, T_text), dtype float32, the input character,
|
||||
where T_text means the timesteps of input text,
|
||||
positional (Variable): shape(B, T_text), dtype int64, the characters position.
|
||||
|
||||
Returns:
|
||||
x (Variable): shape(B, T_text, C), the encoder output.
|
||||
attentions (list[Variable]): len(n_layers), the encoder self attention list.
|
||||
"""
|
||||
|
||||
# Encoder pre_network
|
||||
x = self.encoder_prenet(x)
|
||||
|
||||
if fluid.framework._dygraph_tracer()._train_mode:
|
||||
mask = get_attn_key_pad_mask(positional, self.num_head, x.dtype)
|
||||
query_mask = get_non_pad_mask(positional, self.num_head, x.dtype)
|
||||
|
||||
else:
|
||||
query_mask, mask = None, None
|
||||
|
||||
# Get positional encoding
|
||||
positional = self.pos_emb(positional)
|
||||
|
||||
x = positional * self.alpha + x
|
||||
|
||||
# Positional dropout
|
||||
x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train')
|
||||
|
||||
# Self attention encoder
|
||||
attentions = list()
|
||||
for layer, ffn in zip(self.layers, self.ffns):
|
||||
x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
|
||||
x = ffn(x)
|
||||
attentions.append(attention)
|
||||
|
||||
return x, attentions, query_mask
|
|
@ -1,111 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.customized import Conv1D
|
||||
import numpy as np
|
||||
|
||||
|
||||
class EncoderPrenet(dg.Layer):
|
||||
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
|
||||
""" Encoder prenet layer of TransformerTTS.
|
||||
|
||||
Args:
|
||||
embedding_size (int): the size of embedding.
|
||||
num_hidden (int): the size of hidden layer in network.
|
||||
use_cudnn (bool, optional): use cudnn or not. Defaults to True.
|
||||
"""
|
||||
super(EncoderPrenet, self).__init__()
|
||||
self.embedding_size = embedding_size
|
||||
self.num_hidden = num_hidden
|
||||
self.use_cudnn = use_cudnn
|
||||
self.embedding = dg.Embedding(
|
||||
size=[len(symbols), embedding_size],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.initializer.Normal(
|
||||
loc=0.0, scale=1.0))
|
||||
self.conv_list = []
|
||||
k = math.sqrt(1.0 / embedding_size)
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=embedding_size,
|
||||
num_filters=num_hidden,
|
||||
filter_size=5,
|
||||
padding=int(np.floor(5 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
k = math.sqrt(1.0 / num_hidden)
|
||||
for _ in range(2):
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=num_hidden,
|
||||
filter_size=5,
|
||||
padding=int(np.floor(5 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
for i, layer in enumerate(self.conv_list):
|
||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||
|
||||
self.batch_norm_list = [
|
||||
dg.BatchNorm(
|
||||
num_hidden, data_layout='NCHW') for _ in range(3)
|
||||
]
|
||||
|
||||
for i, layer in enumerate(self.batch_norm_list):
|
||||
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
||||
|
||||
k = math.sqrt(1.0 / num_hidden)
|
||||
self.projection = dg.Linear(
|
||||
num_hidden,
|
||||
num_hidden,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Prepare encoder input.
|
||||
|
||||
Args:
|
||||
x (Variable): shape(B, T_text), dtype float32, the input character, where T_text means the timesteps of input text.
|
||||
|
||||
Returns:
|
||||
(Variable): shape(B, T_text, C), the encoder prenet output.
|
||||
"""
|
||||
|
||||
x = self.embedding(x)
|
||||
x = layers.transpose(x, [0, 2, 1])
|
||||
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
|
||||
x = layers.dropout(
|
||||
layers.relu(batch_norm(conv(x))),
|
||||
0.2,
|
||||
dropout_implementation='upscale_in_train')
|
||||
x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
|
||||
x = self.projection(x)
|
||||
|
||||
return x
|
|
@ -1,137 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.customized import Conv1D
|
||||
|
||||
|
||||
class PostConvNet(dg.Layer):
|
||||
def __init__(self,
|
||||
n_mels=80,
|
||||
num_hidden=512,
|
||||
filter_size=5,
|
||||
padding=0,
|
||||
num_conv=5,
|
||||
outputs_per_step=1,
|
||||
use_cudnn=True,
|
||||
dropout=0.1,
|
||||
batchnorm_last=False):
|
||||
"""Decocder post conv net of TransformerTTS.
|
||||
|
||||
Args:
|
||||
n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
|
||||
num_hidden (int, optional): the size of hidden layer in network. Defaults to 512.
|
||||
filter_size (int, optional): the filter size of Conv. Defaults to 5.
|
||||
padding (int, optional): the padding size of Conv. Defaults to 0.
|
||||
num_conv (int, optional): the num of Conv layers in network. Defaults to 5.
|
||||
outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
|
||||
use_cudnn (bool, optional): use cudnn in Conv or not. Defaults to True.
|
||||
dropout (float, optional): dropout probability. Defaults to 0.1.
|
||||
batchnorm_last (bool, optional): if batchnorm at last layer or not. Defaults to False.
|
||||
"""
|
||||
super(PostConvNet, self).__init__()
|
||||
|
||||
self.dropout = dropout
|
||||
self.num_conv = num_conv
|
||||
self.batchnorm_last = batchnorm_last
|
||||
self.conv_list = []
|
||||
k = math.sqrt(1.0 / (n_mels * outputs_per_step))
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=n_mels * outputs_per_step,
|
||||
num_filters=num_hidden,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
k = math.sqrt(1.0 / num_hidden)
|
||||
for _ in range(1, num_conv - 1):
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=num_hidden,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=n_mels * outputs_per_step,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
for i, layer in enumerate(self.conv_list):
|
||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||
|
||||
self.batch_norm_list = [
|
||||
dg.BatchNorm(
|
||||
num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
|
||||
]
|
||||
if self.batchnorm_last:
|
||||
self.batch_norm_list.append(
|
||||
dg.BatchNorm(
|
||||
n_mels * outputs_per_step, data_layout='NCHW'))
|
||||
for i, layer in enumerate(self.batch_norm_list):
|
||||
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
||||
|
||||
def forward(self, input):
|
||||
"""
|
||||
Compute the mel spectrum.
|
||||
|
||||
Args:
|
||||
input (Variable): shape(B, T, C), dtype float32, the result of mel linear projection.
|
||||
|
||||
Returns:
|
||||
output (Variable): shape(B, T, C), the result after postconvnet.
|
||||
"""
|
||||
|
||||
input = layers.transpose(input, [0, 2, 1])
|
||||
len = input.shape[-1]
|
||||
for i in range(self.num_conv - 1):
|
||||
batch_norm = self.batch_norm_list[i]
|
||||
conv = self.conv_list[i]
|
||||
|
||||
input = layers.dropout(
|
||||
layers.tanh(batch_norm(conv(input)[:, :, :len])),
|
||||
self.dropout,
|
||||
dropout_implementation='upscale_in_train')
|
||||
conv = self.conv_list[self.num_conv - 1]
|
||||
input = conv(input)[:, :, :len]
|
||||
if self.batchnorm_last:
|
||||
batch_norm = self.batch_norm_list[self.num_conv - 1]
|
||||
input = layers.dropout(
|
||||
batch_norm(input),
|
||||
self.dropout,
|
||||
dropout_implementation='upscale_in_train')
|
||||
output = layers.transpose(input, [0, 2, 1])
|
||||
return output
|
|
@@ -1,71 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers


class PreNet(dg.Layer):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """Prenet before passing through the network.

        Args:
            input_size (int): the input channel size.
            hidden_size (int): the size of hidden layer in network.
            output_size (int): the output channel size.
            dropout_rate (float, optional): dropout probability. Defaults to 0.2.
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        k = math.sqrt(1.0 / input_size)
        self.linear1 = dg.Linear(
            input_size,
            hidden_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        k = math.sqrt(1.0 / hidden_size)
        self.linear2 = dg.Linear(
            hidden_size,
            output_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

    def forward(self, x):
        """
        Prepare network input.

        Args:
            x (Variable): shape(B, T, C), dtype float32, the input value.

        Returns:
            output (Variable): shape(B, T, C), the result after pernet.
        """
        x = layers.dropout(
            layers.relu(self.linear1(x)),
            self.dropout_rate,
            dropout_implementation='upscale_in_train')
        output = layers.dropout(
            layers.relu(self.linear2(x)),
            self.dropout_rate,
            dropout_implementation='upscale_in_train')
        return output
@@ -1,71 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformer_tts.decoder import Decoder


class TransformerTTS(dg.Layer):
    def __init__(self,
                 embedding_size,
                 num_hidden,
                 encoder_num_head=4,
                 encoder_n_layers=3,
                 n_mels=80,
                 outputs_per_step=1,
                 decoder_num_head=4,
                 decoder_n_layers=3):
        """TransformerTTS model.

        Args:
            embedding_size (int): the size of position embedding.
            num_hidden (int): the size of hidden layer in network.
            encoder_num_head (int, optional): the head number of multihead attention in encoder. Defaults to 4.
            encoder_n_layers (int, optional): the layers number of multihead attention in encoder. Defaults to 3.
            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
            outputs_per_step (int, optional): the num of output frames per step. Defaults to 1.
            decoder_num_head (int, optional): the head number of multihead attention in decoder. Defaults to 4.
            decoder_n_layers (int, optional): the layers number of multihead attention in decoder. Defaults to 3.
        """
        super(TransformerTTS, self).__init__()
        self.encoder = Encoder(embedding_size, num_hidden, encoder_num_head,
                               encoder_n_layers)
        self.decoder = Decoder(num_hidden, n_mels, outputs_per_step,
                               decoder_num_head, decoder_n_layers)

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """
        TransformerTTS network.

        Args:
            characters (Variable): shape(B, T_text), dtype float32, the input character,
                where T_text means the timesteps of input text,
            mel_input (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder,
                where T_mel means the timesteps of input spectrum,
            pos_text (Variable): shape(B, T_text), dtype int64, the characters position.

        Returns:
            mel_output (Variable): shape(B, T_mel, C), the decoder output after mel linear projection.
            postnet_output (Variable): shape(B, T_mel, C), the decoder output after post mel network.
            stop_preds (Variable): shape(B, T_mel, 1), the stop tokens of output.
            attn_probs (list[Variable]): len(n_layers), the encoder-decoder attention list.
            attns_enc (list[Variable]): len(n_layers), the encoder self attention list.
            attns_dec (list[Variable]): len(n_layers), the decoder self attention list.
        """
        key, attns_enc, query_mask = self.encoder(characters, pos_text)

        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
            key, key, mel_input, pos_mel, query_mask)
        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
@@ -1,101 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import librosa
import os, copy
from scipy import signal
import paddle.fluid.layers as layers


def get_positional_table(d_pos_vec, n_position=1024):
    position_enc = np.array(
        [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)]
         if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])

    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc


def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table '''

    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array(
        [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    if padding_idx is not None:
        # zero vector for padding dimension
        sinusoid_table[padding_idx] = 0.

    return sinusoid_table


def get_non_pad_mask(seq, num_head, dtype):
    mask = layers.cast(seq != 0, dtype=dtype)
    mask = layers.unsqueeze(mask, axes=[-1])
    mask = layers.expand(mask, [num_head, 1, 1])
    return mask


def get_attn_key_pad_mask(seq_k, num_head, dtype):
    ''' For masking out the padding part of key sequence. '''
    # Expand to fit the shape of key query attention matrix.
    padding_mask = layers.cast(seq_k == 0, dtype=dtype) * -1e30
    padding_mask = layers.unsqueeze(padding_mask, axes=[1])
    padding_mask = layers.expand(padding_mask, [num_head, 1, 1])
    return padding_mask


def get_dec_attn_key_pad_mask(seq_k, num_head, dtype):
    ''' For masking out the padding part of key sequence. '''

    # Expand to fit the shape of key query attention matrix.
    padding_mask = layers.cast(seq_k == 0, dtype=dtype)
    padding_mask = layers.unsqueeze(padding_mask, axes=[1])
    len_k = seq_k.shape[1]
    triu = layers.triu(
        layers.ones(
            shape=[len_k, len_k], dtype=dtype), diagonal=1)
    padding_mask = padding_mask + triu
    padding_mask = layers.cast(
        padding_mask != 0, dtype=dtype) * -1e30  #* (-2**32 + 1)
    padding_mask = layers.expand(padding_mask, [num_head, 1, 1])
    return padding_mask


def guided_attention(N, T, g=0.2):
    '''Guided attention. Refer to page 3 on the paper.'''
    W = np.zeros((N, T), dtype=np.float32)
    for n_pos in range(W.shape[0]):
        for t_pos in range(W.shape[1]):
            W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N))
                                         **2 / (2 * g * g))
    return W


def cross_entropy(input, label, weight=1.0, epsilon=1e-30):
    output = -1 * label * layers.log(input + epsilon) - (
        1 - label) * layers.log(1 - input + epsilon)
    output = output * (label * (weight - 1) + 1)

    return layers.reduce_mean(output, dim=[0, 1])
@@ -1,55 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts.cbhg import CBHG


class Vocoder(dg.Layer):
    def __init__(self, batch_size, hidden_size, num_mels=80, n_fft=2048):
        """CBHG Network (mel -> linear)

        Args:
            batch_size (int): the batch size of input.
            hidden_size (int): the size of hidden layer in network.
            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
            n_fft (int, optional): length of the windowed signal after padding with zeros. Defaults to 2048.
        """
        super(Vocoder, self).__init__()
        self.pre_proj = Conv1D(
            num_channels=num_mels, num_filters=hidden_size, filter_size=1)
        self.cbhg = CBHG(hidden_size, batch_size)
        self.post_proj = Conv1D(
            num_channels=hidden_size,
            num_filters=(n_fft // 2) + 1,
            filter_size=1)

    def forward(self, mel):
        """
        Compute mel spectrum to linear spectrum.

        Args:
            mel (Variable): shape(B, C, T), dtype float32, the input mel spectrum.

        Returns:
            mag_pred (Variable): shape(B, T, C), the linear output.
        """
        mel = layers.transpose(mel, [0, 2, 1])
        mel = self.pre_proj(mel)
        mel = self.cbhg(mel)
        mag_pred = self.post_proj(mel)
        mag_pred = layers.transpose(mag_pred, [0, 2, 1])
        return mag_pred
@@ -1,15 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule
@ -1,443 +0,0 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
from parakeet.modules import weight_norm
|
||||
|
||||
|
||||
def get_param_attr(layer_type, filter_size, c_in=1):
|
||||
if layer_type == "weight_norm":
|
||||
k = np.sqrt(1.0 / (c_in * np.prod(filter_size)))
|
||||
weight_init = fluid.initializer.UniformInitializer(low=-k, high=k)
|
||||
bias_init = fluid.initializer.UniformInitializer(low=-k, high=k)
|
||||
elif layer_type == "common":
|
||||
weight_init = fluid.initializer.ConstantInitializer(0.0)
|
||||
bias_init = fluid.initializer.ConstantInitializer(0.0)
|
||||
else:
|
||||
raise TypeError("Unsupported layer type.")
|
||||
|
||||
param_attr = fluid.ParamAttr(initializer=weight_init)
|
||||
bias_attr = fluid.ParamAttr(initializer=bias_init)
|
||||
return param_attr, bias_attr
|
||||
|
||||
|
||||
def unfold(x, n_group):
|
||||
length = x.shape[-1]
|
||||
new_shape = x.shape[:-1] + [length // n_group, n_group]
|
||||
return fluid.layers.reshape(x, new_shape)
|
||||
|
||||
|
||||
class WaveFlowLoss:
|
||||
def __init__(self, sigma=1.0):
|
||||
self.sigma = sigma
|
||||
|
||||
def __call__(self, model_output):
|
||||
z, log_s_list = model_output
|
||||
for i, log_s in enumerate(log_s_list):
|
||||
if i == 0:
|
||||
log_s_total = fluid.layers.reduce_sum(log_s)
|
||||
else:
|
||||
log_s_total = log_s_total + fluid.layers.reduce_sum(log_s)
|
||||
|
||||
loss = fluid.layers.reduce_sum(z * z) / (2 * self.sigma * self.sigma) \
|
||||
- log_s_total
|
||||
loss = loss / np.prod(z.shape)
|
||||
const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
|
||||
|
||||
return loss + const
|
||||
|
||||
|
||||
class Conditioner(dg.Layer):
|
||||
def __init__(self, dtype, upsample_factors):
|
||||
super(Conditioner, self).__init__()
|
||||
|
||||
self.upsample_conv2d = []
|
||||
for s in upsample_factors:
|
||||
in_channel = 1
|
||||
param_attr, bias_attr = get_param_attr(
|
||||
"weight_norm", (3, 2 * s), c_in=in_channel)
|
||||
conv_trans2d = weight_norm.Conv2DTranspose(
|
||||
num_channels=in_channel,
|
||||
num_filters=1,
|
||||
filter_size=(3, 2 * s),
|
||||
padding=(1, s // 2),
|
||||
stride=(1, s),
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
dtype=dtype)
|
||||
self.upsample_conv2d.append(conv_trans2d)
|
||||
|
||||
for i, layer in enumerate(self.upsample_conv2d):
|
||||
self.add_sublayer("conv2d_transpose_{}".format(i), layer)
|
||||
|
||||
def forward(self, x):
|
||||
x = fluid.layers.unsqueeze(x, 1)
|
||||
for layer in self.upsample_conv2d:
|
||||
x = layer(x)
|
||||
x = fluid.layers.leaky_relu(x, alpha=0.4)
|
||||
|
||||
return fluid.layers.squeeze(x, [1])
|
||||
|
||||
def infer(self, x):
|
||||
x = fluid.layers.unsqueeze(x, 1)
|
||||
for layer in self.upsample_conv2d:
|
||||
x = layer(x)
|
||||
# Trim conv artifacts.
|
||||
time_cutoff = layer._filter_size[1] - layer._stride[1]
|
||||
x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4)
|
||||
|
||||
return fluid.layers.squeeze(x, [1])
|
||||
|
||||
|
||||
class Flow(dg.Layer):
|
||||
def __init__(self, config):
|
||||
super(Flow, self).__init__()
|
||||
self.n_layers = config.n_layers
|
||||
self.n_channels = config.n_channels
|
||||
self.kernel_h = config.kernel_h
|
||||
self.kernel_w = config.kernel_w
|
||||
self.dtype = "float16" if config.use_fp16 else "float32"
|
||||
|
||||
# Transform audio: [batch, 1, n_group, time/n_group]
|
||||
# => [batch, n_channels, n_group, time/n_group]
|
||||
param_attr, bias_attr = get_param_attr("weight_norm", (1, 1), c_in=1)
|
||||
self.start = weight_norm.Conv2D(
|
||||
num_channels=1,
|
||||
num_filters=self.n_channels,
|
||||
filter_size=(1, 1),
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
dtype=self.dtype)
|
||||
|
||||
# Initializing last layer to 0 makes the affine coupling layers
|
||||
# do nothing at first. This helps with training stability
|
||||
# output shape: [batch, 2, n_group, time/n_group]
|
||||
param_attr, bias_attr = get_param_attr(
|
||||
"common", (1, 1), c_in=self.n_channels)
|
||||
self.end = dg.Conv2D(
|
||||
num_channels=self.n_channels,
|
||||
num_filters=2,
|
||||
filter_size=(1, 1),
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
dtype=self.dtype)
|
||||
|
||||
# receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze
|
||||
dilation_dict = {
|
||||
8: [1, 1, 1, 1, 1, 1, 1, 1],
|
||||
16: [1, 1, 1, 1, 1, 1, 1, 1],
|
||||
32: [1, 2, 4, 1, 2, 4, 1, 2],
|
||||
64: [1, 2, 4, 8, 16, 1, 2, 4],
|
||||
128: [1, 2, 4, 8, 16, 32, 64, 1]
|
||||
}
|
||||
self.dilation_h_list = dilation_dict[config.n_group]
|
||||
|
||||
self.in_layers = []
|
||||
self.cond_layers = []
|
||||
self.res_skip_layers = []
|
||||
for i in range(self.n_layers):
|
||||
dilation_h = self.dilation_h_list[i]
|
||||
dilation_w = 2**i
|
||||
|
||||
param_attr, bias_attr = get_param_attr(
|
||||
"weight_norm", (self.kernel_h, self.kernel_w),
|
||||
c_in=self.n_channels)
|
||||
in_layer = weight_norm.Conv2D(
|
||||
num_channels=self.n_channels,
|
||||
num_filters=2 * self.n_channels,
|
||||
filter_size=(self.kernel_h, self.kernel_w),
|
||||
dilation=(dilation_h, dilation_w),
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
dtype=self.dtype)
|
||||
self.in_layers.append(in_layer)
|
||||
|
||||
param_attr, bias_attr = get_param_attr(
|
||||
"weight_norm", (1, 1), c_in=config.mel_bands)
|
||||
cond_layer = weight_norm.Conv2D(
|
||||
num_channels=config.mel_bands,
|
||||
num_filters=2 * self.n_channels,
|
||||
filter_size=(1, 1),
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
dtype=self.dtype)
|
||||
self.cond_layers.append(cond_layer)
|
||||
|
||||
if i < self.n_layers - 1:
|
||||
res_skip_channels = 2 * self.n_channels
|
||||
else:
|
||||
res_skip_channels = self.n_channels
|
||||
param_attr, bias_attr = get_param_attr(
|
||||
"weight_norm", (1, 1), c_in=self.n_channels)
|
||||
res_skip_layer = weight_norm.Conv2D(
|
||||
num_channels=self.n_channels,
|
||||
num_filters=res_skip_channels,
|
||||
filter_size=(1, 1),
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
dtype=self.dtype)
|
||||
self.res_skip_layers.append(res_skip_layer)
|
||||
|
||||
self.add_sublayer("in_layer_{}".format(i), in_layer)
|
||||
self.add_sublayer("cond_layer_{}".format(i), cond_layer)
|
||||
self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer)
|
||||
|
||||
def forward(self, audio, mel):
|
||||
# audio: [bs, 1, n_group, time/group]
|
||||
# mel: [bs, mel_bands, n_group, time/n_group]
|
||||
audio = self.start(audio)
|
||||
|
||||
for i in range(self.n_layers):
|
||||
dilation_h = self.dilation_h_list[i]
|
||||
dilation_w = 2**i
|
||||
|
||||
# Pad height dim (n_group): causal convolution
|
||||
# Pad width dim (time): dialated non-causal convolution
|
||||
pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
|
||||
pad_left = pad_right = int((self.kernel_w - 1) * dilation_w / 2)
|
||||
# Using pad2d is a bit faster than using padding in Conv2D directly
|
||||
audio_pad = fluid.layers.pad2d(
|
||||
audio, paddings=[pad_top, pad_bottom, pad_left, pad_right])
|
||||
hidden = self.in_layers[i](audio_pad)
|
||||
cond_hidden = self.cond_layers[i](mel)
|
||||
in_acts = hidden + cond_hidden
|
||||
out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
|
||||
fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
|
||||
res_skip_acts = self.res_skip_layers[i](out_acts)
|
||||
|
||||
if i < self.n_layers - 1:
|
||||
audio += res_skip_acts[:, :self.n_channels, :, :]
|
||||
skip_acts = res_skip_acts[:, self.n_channels:, :, :]
|
||||
else:
|
||||
skip_acts = res_skip_acts
|
||||
|
||||
if i == 0:
|
||||
output = skip_acts
|
||||
else:
|
||||
output += skip_acts
|
||||
|
||||
return self.end(output)
|
||||
|
||||
def infer(self, audio, mel, queues):
|
||||
audio = self.start(audio)
|
||||
|
||||
for i in range(self.n_layers):
|
||||
dilation_h = self.dilation_h_list[i]
|
||||
dilation_w = 2**i
|
||||
|
||||
state_size = dilation_h * (self.kernel_h - 1)
|
||||
queue = queues[i]
|
||||
|
||||
if len(queue) == 0:
|
||||
for j in range(state_size):
|
||||
queue.append(fluid.layers.zeros_like(audio))
|
||||
|
||||
state = queue[0:state_size]
|
||||
state = fluid.layers.concat(state + [audio], axis=2)
|
||||
|
||||
queue.pop(0)
|
||||
queue.append(audio)
|
||||
|
||||
# Pad height dim (n_group): causal convolution
|
||||
# Pad width dim (time): dialated non-causal convolution
|
||||
pad_top, pad_bottom = 0, 0
|
||||
pad_left = int((self.kernel_w - 1) * dilation_w / 2)
|
||||
pad_right = int((self.kernel_w - 1) * dilation_w / 2)
|
||||
state = fluid.layers.pad2d(
|
||||
state, paddings=[pad_top, pad_bottom, pad_left, pad_right])
|
||||
hidden = self.in_layers[i](state)
|
||||
cond_hidden = self.cond_layers[i](mel)
|
||||
in_acts = hidden + cond_hidden
|
||||
out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
|
||||
fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
|
||||
res_skip_acts = self.res_skip_layers[i](out_acts)
|
||||
|
||||
if i < self.n_layers - 1:
|
||||
audio += res_skip_acts[:, :self.n_channels, :, :]
|
||||
skip_acts = res_skip_acts[:, self.n_channels:, :, :]
|
||||
else:
|
||||
skip_acts = res_skip_acts
|
||||
|
||||
if i == 0:
|
||||
output = skip_acts
|
||||
else:
|
||||
output += skip_acts
|
||||
|
||||
return self.end(output)
|
||||
|
||||
|
||||
class WaveFlowModule(dg.Layer):
|
||||
"""WaveFlow model implementation.
|
||||
|
||||
Args:
|
||||
config (obj): model configuration parameters.
|
||||
|
||||
Returns:
|
||||
WaveFlowModule
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(WaveFlowModule, self).__init__()
|
||||
self.n_flows = config.n_flows
|
||||
self.n_group = config.n_group
|
||||
self.n_layers = config.n_layers
|
||||
self.upsample_factors = config.upsample_factors if hasattr(
|
||||
config, "upsample_factors") else [16, 16]
|
||||
assert self.n_group % 2 == 0
|
||||
assert self.n_flows % 2 == 0
|
||||
|
||||
self.dtype = "float16" if config.use_fp16 else "float32"
|
||||
self.conditioner = Conditioner(self.dtype, self.upsample_factors)
|
||||
self.flows = []
|
||||
for i in range(self.n_flows):
|
||||
flow = Flow(config)
|
||||
self.flows.append(flow)
|
||||
self.add_sublayer("flow_{}".format(i), flow)
|
||||
|
||||
self.perms = []
|
||||
half = self.n_group // 2
|
||||
for i in range(self.n_flows):
|
||||
perm = list(range(self.n_group))
|
||||
if i < self.n_flows // 2:
|
||||
perm = perm[::-1]
|
||||
else:
|
||||
perm[:half] = reversed(perm[:half])
|
||||
perm[half:] = reversed(perm[half:])
|
||||
self.perms.append(perm)
|
||||
|
||||
def forward(self, audio, mel):
|
||||
"""Training forward pass.
|
||||
|
||||
Use a conditioner to upsample mel spectrograms into hidden states.
|
||||
These hidden states along with the audio are passed to a stack of Flow
|
||||
modules to obtain the final latent variable z and a list of log scaling
|
||||
variables, which are then passed to the WaveFlowLoss module to calculate
|
||||
the negative log likelihood.
|
||||
|
||||
Args:
|
||||
audio (obj): audio samples.
|
||||
mel (obj): mel spectrograms.
|
||||
|
||||
Returns:
|
||||
z (obj): latent variable.
|
||||
log_s_list(list): list of log scaling variables.
|
||||
"""
|
||||
mel = self.conditioner(mel)
|
||||
assert mel.shape[2] >= audio.shape[1]
|
||||
# Prune out the tail of audio/mel so that time/n_group == 0.
|
||||
pruned_len = int(audio.shape[1] // self.n_group * self.n_group)
|
||||
|
||||
if audio.shape[1] > pruned_len:
|
||||
audio = audio[:, :pruned_len]
|
||||
if mel.shape[2] > pruned_len:
|
||||
mel = mel[:, :, :pruned_len]
|
||||
|
||||
# From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
|
||||
mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
|
||||
# From [bs, time] to [bs, n_group, time/n_group]
|
||||
audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
|
||||
# [bs, 1, n_group, time/n_group]
|
||||
audio = fluid.layers.unsqueeze(audio, 1)
|
||||
log_s_list = []
|
||||
for i in range(self.n_flows):
|
||||
inputs = audio[:, :, :-1, :]
|
||||
conds = mel[:, :, 1:, :]
|
||||
outputs = self.flows[i](inputs, conds)
|
||||
log_s = outputs[:, :1, :, :]
|
||||
b = outputs[:, 1:, :, :]
|
||||
log_s_list.append(log_s)
|
||||
|
||||
audio_0 = audio[:, :, :1, :]
|
||||
audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b
|
||||
audio = fluid.layers.concat([audio_0, audio_out], axis=2)
|
||||
|
||||
# Permute over the height dim.
|
||||
audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
|
||||
audio = fluid.layers.stack(audio_slices, axis=2)
|
||||
mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
|
||||
mel = fluid.layers.stack(mel_slices, axis=2)
|
||||
|
||||
z = fluid.layers.squeeze(audio, [1])
|
||||
return z, log_s_list
|
||||
|
||||
def synthesize(self, mel, sigma=1.0):
|
||||
"""Use model to synthesize waveform.
|
||||
|
||||
Use a conditioner to upsample mel spectrograms into hidden states.
|
||||
These hidden states along with initial random gaussian latent variable
|
||||
are passed to a stack of Flow modules to obtain the audio output.
|
||||
|
||||
Note that we use convolutional queue (https://arxiv.org/abs/1611.09482)
|
||||
to cache the intermediate hidden states, which will speed up the
|
||||
autoregressive inference over the height dimension. Current
|
||||
implementation only supports height dimension (self.n_group) equals
|
||||
8 or 16, i.e., where there is no dilation on the height dimension.
|
||||
|
||||
Args:
|
||||
mel (obj): mel spectrograms.
|
||||
sigma (float, optional): standard deviation of the guassian latent
|
||||
variable. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
audio (obj): synthesized audio.
|
||||
"""
|
||||
if self.dtype == "float16":
|
||||
mel = fluid.layers.cast(mel, self.dtype)
|
||||
mel = self.conditioner.infer(mel)
|
||||
# Prune out the tail of mel so that time/n_group == 0.
|
||||
pruned_len = int(mel.shape[2] // self.n_group * self.n_group)
|
||||
if mel.shape[2] > pruned_len:
|
||||
mel = mel[:, :, :pruned_len]
|
||||
# From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
|
||||
mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
|
||||
|
||||
audio = fluid.layers.gaussian_random(
|
||||
shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma)
|
||||
if self.dtype == "float16":
|
||||
audio = fluid.layers.cast(audio, self.dtype)
|
||||
for i in reversed(range(self.n_flows)):
|
||||
# Permute over the height dimension.
|
||||
audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
|
||||
audio = fluid.layers.stack(audio_slices, axis=2)
|
||||
mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
|
||||
mel = fluid.layers.stack(mel_slices, axis=2)
|
||||
|
||||
audio_list = []
|
||||
audio_0 = audio[:, :, 0:1, :]
|
||||
audio_list.append(audio_0)
|
||||
audio_h = audio_0
|
||||
queues = [[] for _ in range(self.n_layers)]
|
||||
|
||||
for h in range(1, self.n_group):
|
||||
inputs = audio_h
|
||||
conds = mel[:, :, h:(h + 1), :]
|
||||
outputs = self.flows[i].infer(inputs, conds, queues)
|
||||
|
||||
log_s = outputs[:, 0:1, :, :]
|
||||
b = outputs[:, 1:, :, :]
|
||||
audio_h = (audio[:, :, h:(h+1), :] - b) / \
|
||||
fluid.layers.exp(log_s)
|
||||
audio_list.append(audio_h)
|
||||
|
||||
audio = fluid.layers.concat(audio_list, axis=2)
|
||||
|
||||
# audio: [bs, n_group, time/n_group]
|
||||
audio = fluid.layers.squeeze(audio, [1])
|
||||
# audio: [bs, time]
|
||||
audio = fluid.layers.reshape(
|
||||
fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])
|
||||
return audio
|