Add a CLI for cloning examples

This commit is contained in:
iclementine 2020-11-19 18:08:11 +08:00
parent c7e5aaa540
commit a01200e437
13 changed files with 36 additions and 1605 deletions

36
parakeet/__main__.py Normal file
View File

@ -0,0 +1,36 @@
import parakeet
if __name__ == '__main__':
    # Command line entry point: ``python -m parakeet <sub-command>``.
    #
    # Sub-commands:
    #   list-examples                   print the entries of the bundled
    #                                   ``examples`` directory
    #   clone-example EXPERIMENT_NAME   copy one bundled example into the
    #                                   current working directory
    import argparse
    import os
    import shutil
    import sys
    from pathlib import Path

    # The examples directory is looked up next to this file.
    package_path = Path(__file__).parent
    print(package_path)

    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers(dest="cmd")
    subparser.add_parser("list-examples")
    clone = subparser.add_parser("clone-example")
    clone.add_argument("experiment_name", type=str, help="experiment name")
    args = parser.parse_args()

    # ``sys.exit`` is used instead of the ``exit`` builtin: the latter is
    # injected by the ``site`` module and is meant for interactive use only.
    if args.cmd == "list-examples":
        print(os.listdir(package_path / "examples"))
        sys.exit(0)

    if args.cmd == "clone-example":
        source = package_path / "examples" / (args.experiment_name)
        target = Path(os.getcwd()) / (args.experiment_name)
        if not os.path.exists(str(source)):
            raise ValueError("{} does not exist".format(str(source)))
        # Refuse to overwrite an existing file or directory.
        if os.path.exists(str(target)):
            raise FileExistsError("{} already exists".format(str(target)))
        shutil.copytree(str(source), str(target))
        print("{} copied!".format(args.experiment_name))
        sys.exit(0)

    # No sub-command was given: show the usage instead of exiting silently.
    parser.print_help()

View File

@ -1,15 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .transformer_tts import TransformerTTS
from .vocoder import Vocoder

View File

@ -1,287 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Pool1D, Conv1D
from parakeet.modules.dynamic_gru import DynamicGRU
import numpy as np
class CBHG(dg.Layer):
    """Convolution bank, highway network and two bidirectional GRU stages."""

    def __init__(self,
                 hidden_size,
                 batch_size,
                 K=16,
                 projection_size=256,
                 num_gru_layers=2,
                 max_pool_kernel_size=2,
                 is_post=False):
        """CBHG Module

        Args:
            hidden_size (int): dimension of hidden unit.
            batch_size (int): batch size of input; it fixes the shape of the
                all-zero initial GRU state created here.
            K (int, optional): number of convolution banks. Defaults to 16.
            projection_size (int, optional): dimension of projection unit.
                Defaults to 256.
            num_gru_layers (int, optional): accepted but not read by this
                constructor; two GRU stages are always built. Defaults to 2.
            max_pool_kernel_size (int, optional): max pooling kernel size.
                Defaults to 2.
            is_post (bool, optional): accepted but not read by this
                constructor. Defaults to False.
        """
        super(CBHG, self).__init__()
        self.hidden_size = hidden_size
        self.projection_size = projection_size

        def xavier():
            # Fresh weight attribute using Xavier initialization.
            return fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer())

        def uniform(bound):
            # Fresh bias attribute drawn uniformly from [-bound, bound].
            return fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-bound, high=bound))

        # Convolution bank: the first convolution (width 1) maps
        # projection_size channels to hidden_size, the remaining ones
        # (widths 2..K) keep hidden_size channels.
        self.conv_list = []
        first_bound = math.sqrt(1.0 / projection_size)
        self.conv_list.append(
            Conv1D(
                num_channels=projection_size,
                num_filters=hidden_size,
                filter_size=1,
                padding=0,
                param_attr=xavier(),
                bias_attr=uniform(first_bound)))
        bank_bound = math.sqrt(1.0 / hidden_size)
        for width in range(2, K + 1):
            self.conv_list.append(
                Conv1D(
                    num_channels=hidden_size,
                    num_filters=hidden_size,
                    filter_size=width,
                    padding=width // 2,
                    param_attr=xavier(),
                    bias_attr=uniform(bank_bound)))
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        # One batch norm per convolution bank.
        self.batchnorm_list = [
            dg.BatchNorm(
                hidden_size, data_layout='NCHW') for _ in range(K)
        ]
        for idx, norm in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(idx), norm)

        # Two width-3 projections applied to the concatenated bank outputs.
        conv_outdim = hidden_size * K
        self.conv_projection_1 = Conv1D(
            num_channels=conv_outdim,
            num_filters=hidden_size,
            filter_size=3,
            padding=1,
            param_attr=xavier(),
            bias_attr=uniform(math.sqrt(1.0 / conv_outdim)))
        self.conv_projection_2 = Conv1D(
            num_channels=hidden_size,
            num_filters=projection_size,
            filter_size=3,
            padding=1,
            param_attr=xavier(),
            bias_attr=uniform(math.sqrt(1.0 / hidden_size)))

        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
        self.batchnorm_proj_2 = dg.BatchNorm(
            projection_size, data_layout='NCHW')
        self.max_pool = Pool1D(
            pool_size=max_pool_kernel_size,
            pool_type='max',
            pool_stride=1,
            pool_padding=1,
            data_format="NCT")
        self.highway = Highwaynet(self.projection_size)

        # All four GRUs share the same zero initial state.
        h_0 = dg.to_variable(
            np.zeros(
                (batch_size, hidden_size // 2), dtype="float32"))
        fc_bound = math.sqrt(1.0 / hidden_size)

        def gru_input_fc():
            # Linear layer producing the 3 * (hidden_size // 2) GRU inputs.
            return dg.Linear(
                hidden_size,
                hidden_size // 2 * 3,
                param_attr=xavier(),
                bias_attr=uniform(fc_bound))

        def gru(reverse):
            # GRU with hidden_size // 2 units running in the given direction.
            return DynamicGRU(
                size=self.hidden_size // 2,
                is_reverse=reverse,
                origin_mode=True,
                h_0=h_0)

        # First bidirectional stage.
        self.fc_forward1 = gru_input_fc()
        self.fc_reverse1 = gru_input_fc()
        self.gru_forward1 = gru(False)
        self.gru_reverse1 = gru(True)
        # Second bidirectional stage.
        self.fc_forward2 = gru_input_fc()
        self.fc_reverse2 = gru_input_fc()
        self.gru_forward2 = gru(False)
        self.gru_reverse2 = gru(True)

    def _conv_fit_dim(self, x, filter_size=3):
        """Drop the last step of the last axis when ``filter_size`` is even."""
        return x[:, :, :-1] if filter_size % 2 == 0 else x

    def forward(self, input_):
        """
        Run the convolution bank, projections, highway network and GRUs.

        Args:
            input_ (Variable): shape(B, C, T), dtype float32, the sequentially input.

        Returns:
            out (Variable): shape(B, C, T), the CBHG output.
        """
        # Each bank consumes the output of the previous bank; every
        # intermediate result is kept for the channel-wise concatenation.
        bank_outputs = []
        feature = input_
        banks = zip(self.conv_list, self.batchnorm_list)
        for width, (conv, norm) in enumerate(banks, 1):
            feature = self._conv_fit_dim(conv(feature), width)
            feature = layers.relu(norm(feature))
            bank_outputs.append(feature)
        stacked = layers.concat(bank_outputs, axis=1)

        # Pooling output is trimmed by one step on the last axis.
        pooled = self.max_pool(stacked)[:, :, :-1]

        projected = self._conv_fit_dim(self.conv_projection_1(pooled))
        projected = layers.relu(self.batchnorm_proj_1(projected))
        projected = self._conv_fit_dim(self.conv_projection_2(projected))
        # Residual connection with the module input.
        projected = self.batchnorm_proj_2(projected) + input_

        # projected.shape = [N, C, T] -> highway input [N, T, C]
        hidden = self.highway(layers.transpose(projected, [0, 2, 1]))

        # First stage: forward and reverse GRUs, concatenated on channels.
        forward_in = self.fc_forward1(hidden)
        reverse_in = self.fc_reverse1(hidden)
        forward_out = self.gru_forward1(forward_in)
        reverse_out = self.gru_reverse1(reverse_in)
        out = layers.concat([forward_out, reverse_out], axis=-1)

        # Second stage on top of the first stage output.
        forward_in = self.fc_forward2(out)
        reverse_in = self.fc_reverse2(out)
        forward_out = self.gru_forward2(forward_in)
        reverse_out = self.gru_reverse2(reverse_in)
        out = layers.concat([forward_out, reverse_out], axis=-1)

        return layers.transpose(out, [0, 2, 1])
class Highwaynet(dg.Layer):
    """Stack of gated highway layers that keep the feature size unchanged."""

    def __init__(self, num_units, num_layers=4):
        """Highway network

        Args:
            num_units (int): dimension of hidden unit.
            num_layers (int, optional): number of highway layers. Defaults to 4.
        """
        super(Highwaynet, self).__init__()
        self.num_units = num_units
        self.num_layers = num_layers
        self.gates = []
        self.linears = []
        bound = math.sqrt(1.0 / num_units)

        def square_linear():
            # num_units -> num_units linear layer with Xavier weights and a
            # bias drawn uniformly from [-bound, bound].
            return dg.Linear(
                num_units,
                num_units,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-bound, high=bound)))

        # Each highway layer owns one transform layer and one gate layer.
        for _ in range(num_layers):
            self.linears.append(square_linear())
            self.gates.append(square_linear())

        # Plain Python lists are not tracked, so register every layer.
        pairs = zip(self.linears, self.gates)
        for idx, (linear, gate) in enumerate(pairs):
            self.add_sublayer("linears_{}".format(idx), linear)
            self.add_sublayer("gates_{}".format(idx), gate)

    def forward(self, input_):
        """
        Compute result of Highway network.

        Args:
            input_(Variable): shape(B, T, C), dtype float32, the sequentially input.

        Returns:
            out(Variable): the Highway output.
        """
        out = input_
        for linear, gate in zip(self.linears, self.gates):
            candidate = fluid.layers.relu(linear(out))
            transform = fluid.layers.sigmoid(gate(out))
            carry = 1 - transform
            # Mix the transformed candidate with the carried-over input.
            out = candidate * transform + out * carry
        return out

View File

@ -1,193 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet
class Decoder(dg.Layer):
    """Decoder of TransformerTTS: prenet, attention stack and mel post-net."""

    def __init__(self,
                 num_hidden,
                 num_mels=80,
                 outputs_per_step=1,
                 num_head=4,
                 n_layers=3):
        """Decoder layer of TransformerTTS.

        Args:
            num_hidden (int): the size of hidden layer in network.
            num_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
            num_head (int, optional): the head number of multihead attention. Defaults to 4.
            n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
        """
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head

        def xavier():
            # Fresh weight attribute using Xavier initialization.
            return fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer())

        def uniform(bound):
            # Fresh bias attribute drawn uniformly from [-bound, bound].
            return fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-bound, high=bound))

        # Learnable scalar applied to the positional embedding; starts at 1.
        alpha_attr = fluid.ParamAttr()
        self.alpha = self.create_parameter(
            shape=(1, ),
            attr=alpha_attr,
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))

        # Frozen sinusoidal position table with 1024 rows; row 0 is zeroed.
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))

        self.decoder_prenet = PreNet(
            input_size=num_mels,
            hidden_size=num_hidden * 2,
            output_size=num_hidden,
            dropout_rate=0.2)

        bound = math.sqrt(1.0 / num_hidden)
        self.linear = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=xavier(),
            bias_attr=uniform(bound))

        head_size = num_hidden // num_head

        # Decoder self-attention layers.
        self.selfattn_layers = []
        for _ in range(n_layers):
            self.selfattn_layers.append(
                MultiheadAttention(num_hidden, head_size, head_size))
        for idx, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(idx), layer)

        # Encoder-decoder attention layers.
        self.attn_layers = []
        for _ in range(n_layers):
            self.attn_layers.append(
                MultiheadAttention(num_hidden, head_size, head_size))
        for idx, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(idx), layer)

        # Position-wise feed-forward layers.
        self.ffns = []
        for _ in range(n_layers):
            self.ffns.append(
                PositionwiseFeedForward(
                    num_hidden, num_hidden * num_head, filter_size=1))
        for idx, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(idx), layer)

        self.mel_linear = dg.Linear(
            num_hidden,
            num_mels * outputs_per_step,
            param_attr=xavier(),
            bias_attr=uniform(bound))
        self.stop_linear = dg.Linear(
            num_hidden, 1, param_attr=xavier(), bias_attr=uniform(bound))

        self.postconvnet = PostConvNet(
            num_mels,
            num_hidden,
            filter_size=5,
            padding=4,
            num_conv=5,
            outputs_per_step=outputs_per_step,
            use_cudnn=True)

    def forward(self, key, value, query, positional, c_mask):
        """
        Compute decoder outputs.

        Args:
            key (Variable): shape(B, T_text, C), dtype float32, the input key of decoder,
                where T_text means the timesteps of input text,
            value (Variable): shape(B, T_text, C), dtype float32, the input value of decoder.
            query (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder,
                where T_mel means the timesteps of input spectrum,
            positional (Variable): shape(B, T_mel), dtype int64, the spectrum position.
            c_mask (Variable): shape(B, T_text, 1), dtype float32, query mask returned from encoder.
                Only read when the dygraph tracer is in train mode.

        Returns:
            mel_out (Variable): shape(B, T_mel, C), the decoder output after mel linear projection.
            out (Variable): shape(B, T_mel, C), the decoder output after post mel network.
            attn_list (list[Variable]): len(n_layers), the encoder-decoder attention list.
            stop_tokens (Variable): the stop tokens of output, after the last axis is squeezed
                and a sigmoid is applied.
            selfattn_list (list[Variable]): len(n_layers), the decoder self attention list.
        """
        training = fluid.framework._dygraph_tracer()._train_mode
        if training:
            # Masks derived from the position ids and the encoder mask.
            mask = get_dec_attn_key_pad_mask(positional, self.num_head,
                                             query.dtype)
            m_mask = get_non_pad_mask(positional, self.num_head, query.dtype)
            zero_mask = layers.cast(c_mask == 0, dtype=query.dtype) * -1e30
            zero_mask = layers.transpose(zero_mask, perm=[0, 2, 1])
        else:
            # Strictly upper-triangular mask with -1e30 on masked positions.
            len_q = query.shape[1]
            upper = layers.triu(
                layers.ones(
                    shape=[len_q, len_q], dtype=query.dtype),
                diagonal=1)
            mask = layers.cast(upper != 0, dtype=query.dtype) * -1e30
            m_mask = None
            zero_mask = None

        # Decoder pre-network followed by a linear projection.
        query = self.linear(self.decoder_prenet(query))

        # Add the scaled positional embedding, then positional dropout.
        positional = self.pos_emb(positional)
        query = positional * self.alpha + query
        query = fluid.layers.dropout(
            query, 0.1, dropout_implementation='upscale_in_train')

        # Attention decoder-decoder, encoder-decoder
        selfattn_list = []
        attn_list = []
        blocks = zip(self.selfattn_layers, self.attn_layers, self.ffns)
        for selfattn, attn, ffn in blocks:
            query, attn_dec = selfattn(
                query, query, query, mask=mask, query_mask=m_mask)
            query, attn_dot = attn(
                key, value, query, mask=zero_mask, query_mask=m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)

        # Mel linear projection plus the residual from the post mel network.
        mel_out = self.mel_linear(query)
        out = mel_out + self.postconvnet(mel_out)

        # Stop tokens
        stop_tokens = layers.squeeze(self.stop_linear(query), [-1])
        stop_tokens = layers.sigmoid(stop_tokens)

        return mel_out, out, attn_list, stop_tokens, selfattn_list

View File

@ -1,106 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
class Encoder(dg.Layer):
    """Encoder of TransformerTTS: prenet followed by self-attention blocks."""

    def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3):
        """Encoder layer of TransformerTTS.

        Args:
            embedding_size (int): the embedding size passed to the encoder prenet.
            num_hidden (int): the size of hidden layer in network.
            num_head (int, optional): the head number of multihead attention. Defaults to 4.
            n_layers (int, optional): the layers number of multihead attention. Defaults to 3.
        """
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head

        # Learnable scalar applied to the positional embedding; starts at 1.
        alpha_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
            value=1.0))
        self.alpha = self.create_parameter(
            shape=(1, ), attr=alpha_attr, dtype='float32')

        # Frozen sinusoidal position table with 1024 rows; row 0 is zeroed.
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))

        self.encoder_prenet = EncoderPrenet(
            embedding_size=embedding_size,
            num_hidden=num_hidden,
            use_cudnn=True)

        head_size = num_hidden // num_head

        # Self-attention layers.
        self.layers = []
        for _ in range(n_layers):
            self.layers.append(
                MultiheadAttention(num_hidden, head_size, head_size))
        for idx, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(idx), layer)

        # Position-wise feed-forward layers.
        self.ffns = []
        for _ in range(n_layers):
            self.ffns.append(
                PositionwiseFeedForward(
                    num_hidden,
                    num_hidden * num_head,
                    filter_size=1,
                    use_cudnn=True))
        for idx, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(idx), layer)

    def forward(self, x, positional):
        """
        Encode text sequence.

        Args:
            x (Variable): shape(B, T_text), dtype float32, the input character,
                where T_text means the timesteps of input text,
            positional (Variable): shape(B, T_text), dtype int64, the characters position.

        Returns:
            x (Variable): shape(B, T_text, C), the encoder output.
            attentions (list[Variable]): len(n_layers), the encoder self attention list.
            query_mask (Variable or None): the non-pad mask built from ``positional`` when
                the dygraph tracer is in train mode, otherwise None.
        """
        # Encoder pre_network
        x = self.encoder_prenet(x)

        training = fluid.framework._dygraph_tracer()._train_mode
        if training:
            mask = get_attn_key_pad_mask(positional, self.num_head, x.dtype)
            query_mask = get_non_pad_mask(positional, self.num_head, x.dtype)
        else:
            mask = None
            query_mask = None

        # Add the scaled positional encoding, then positional dropout.
        x = self.pos_emb(positional) * self.alpha + x
        x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train')

        # Self attention encoder
        attentions = []
        for attn, ffn in zip(self.layers, self.ffns):
            x, weights = attn(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(weights)

        return x, attentions, query_mask

View File

@ -1,111 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Conv1D
import numpy as np
class EncoderPrenet(dg.Layer):
    """Symbol embedding, three conv/batch-norm stages and a projection."""

    def __init__(self, embedding_size, num_hidden, use_cudnn=True):
        """ Encoder prenet layer of TransformerTTS.

        Args:
            embedding_size (int): the size of embedding.
            num_hidden (int): the size of hidden layer in network.
            use_cudnn (bool, optional): use cudnn or not. Defaults to True.
        """
        super(EncoderPrenet, self).__init__()
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn

        # One embedding row per entry of ``symbols``; index 0 is padding.
        self.embedding = dg.Embedding(
            size=[len(symbols), embedding_size],
            padding_idx=0,
            param_attr=fluid.initializer.Normal(
                loc=0.0, scale=1.0))

        def conv5(in_channels):
            # Width-5 convolution to num_hidden channels; the bias bound
            # depends on the number of input channels.
            bound = math.sqrt(1.0 / in_channels)
            return Conv1D(
                num_channels=in_channels,
                num_filters=num_hidden,
                filter_size=5,
                padding=2,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-bound, high=bound)),
                use_cudnn=use_cudnn)

        # The first convolution reads the embedding, the next two keep
        # num_hidden channels.
        self.conv_list = [conv5(embedding_size)]
        for _ in range(2):
            self.conv_list.append(conv5(num_hidden))
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        self.batch_norm_list = []
        for _ in range(3):
            self.batch_norm_list.append(
                dg.BatchNorm(
                    num_hidden, data_layout='NCHW'))
        for idx, norm in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), norm)

        proj_bound = math.sqrt(1.0 / num_hidden)
        self.projection = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-proj_bound, high=proj_bound)))

    def forward(self, x):
        """
        Prepare encoder input.

        Args:
            x (Variable): shape(B, T_text), dtype float32, the input character, where T_text means the timesteps of input text.

        Returns:
            (Variable): shape(B, T_text, C), the encoder prenet output.
        """
        embedded = self.embedding(x)
        # Move channels in front of time for the convolutions.
        hidden = layers.transpose(embedded, [0, 2, 1])
        for norm, conv in zip(self.batch_norm_list, self.conv_list):
            activated = layers.relu(norm(conv(hidden)))
            hidden = layers.dropout(
                activated, 0.2, dropout_implementation='upscale_in_train')
        # Back to (N, T, C) for the projection.
        hidden = layers.transpose(hidden, [0, 2, 1])
        return self.projection(hidden)

View File

@ -1,137 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Conv1D
class PostConvNet(dg.Layer):
    """Convolutional post-net applied to the decoder's mel projection."""

    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
                 padding=0,
                 num_conv=5,
                 outputs_per_step=1,
                 use_cudnn=True,
                 dropout=0.1,
                 batchnorm_last=False):
        """Decoder post conv net of TransformerTTS.

        Args:
            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
            num_hidden (int, optional): the size of hidden layer in network. Defaults to 512.
            filter_size (int, optional): the filter size of Conv. Defaults to 5.
            padding (int, optional): the padding size of Conv. Defaults to 0.
            num_conv (int, optional): the num of Conv layers in network. Defaults to 5.
            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
            use_cudnn (bool, optional): use cudnn in Conv or not. Defaults to True.
            dropout (float, optional): dropout probability. Defaults to 0.1.
            batchnorm_last (bool, optional): if batchnorm at last layer or not. Defaults to False.
        """
        super(PostConvNet, self).__init__()
        self.dropout = dropout
        self.num_conv = num_conv
        self.batchnorm_last = batchnorm_last

        mel_channels = n_mels * outputs_per_step

        def conv(in_channels, out_channels, bound):
            # Convolution sharing filter size, padding and cudnn setting.
            return Conv1D(
                num_channels=in_channels,
                num_filters=out_channels,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-bound, high=bound)),
                use_cudnn=use_cudnn)

        # First conv: mel channels -> hidden.
        self.conv_list = [
            conv(mel_channels, num_hidden, math.sqrt(1.0 / mel_channels))
        ]
        hidden_bound = math.sqrt(1.0 / num_hidden)
        # Middle convs: hidden -> hidden.
        for _ in range(1, num_conv - 1):
            self.conv_list.append(conv(num_hidden, num_hidden, hidden_bound))
        # Last conv: hidden -> mel channels.
        self.conv_list.append(conv(num_hidden, mel_channels, hidden_bound))
        for idx, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), layer)

        # One batch norm per hidden conv, plus an optional one for the
        # last conv.
        self.batch_norm_list = []
        for _ in range(num_conv - 1):
            self.batch_norm_list.append(
                dg.BatchNorm(
                    num_hidden, data_layout='NCHW'))
        if self.batchnorm_last:
            self.batch_norm_list.append(
                dg.BatchNorm(
                    mel_channels, data_layout='NCHW'))
        for idx, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), layer)

    def forward(self, input):
        """
        Compute the mel spectrum.

        Args:
            input (Variable): shape(B, T, C), dtype float32, the result of mel linear projection.

        Returns:
            output (Variable): shape(B, T, C), the result after postconvnet.
        """
        hidden = layers.transpose(input, [0, 2, 1])
        # Every conv output is cut back to the input length on the last axis.
        n_frames = hidden.shape[-1]
        last = self.num_conv - 1

        for idx in range(last):
            norm = self.batch_norm_list[idx]
            conv = self.conv_list[idx]
            trimmed = conv(hidden)[:, :, :n_frames]
            hidden = layers.dropout(
                layers.tanh(norm(trimmed)),
                self.dropout,
                dropout_implementation='upscale_in_train')

        # The last conv has no tanh; batch norm and dropout are optional.
        hidden = self.conv_list[last](hidden)[:, :, :n_frames]
        if self.batchnorm_last:
            norm = self.batch_norm_list[last]
            hidden = layers.dropout(
                norm(hidden),
                self.dropout,
                dropout_implementation='upscale_in_train')

        return layers.transpose(hidden, [0, 2, 1])

View File

@ -1,71 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class PreNet(dg.Layer):
    """Two linear layers, each followed by ReLU and dropout."""

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """Prenet before passing through the network.

        Args:
            input_size (int): the input channel size.
            hidden_size (int): the size of hidden layer in network.
            output_size (int): the output channel size.
            dropout_rate (float, optional): dropout probability. Defaults to 0.2.
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        def linear(in_features, out_features):
            # Xavier weights; bias uniform in +/- sqrt(1 / in_features).
            bound = math.sqrt(1.0 / in_features)
            return dg.Linear(
                in_features,
                out_features,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-bound, high=bound)))

        self.linear1 = linear(input_size, hidden_size)
        self.linear2 = linear(hidden_size, output_size)

    def forward(self, x):
        """
        Prepare network input.

        Args:
            x (Variable): shape(B, T, C), dtype float32, the input value.

        Returns:
            output (Variable): shape(B, T, C), the result after prenet.
        """
        output = x
        for linear in (self.linear1, self.linear2):
            output = layers.dropout(
                layers.relu(linear(output)),
                self.dropout_rate,
                dropout_implementation='upscale_in_train')
        return output

View File

@ -1,71 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformer_tts.decoder import Decoder
class TransformerTTS(dg.Layer):
    """Encoder-decoder TransformerTTS model."""

    def __init__(self,
                 embedding_size,
                 num_hidden,
                 encoder_num_head=4,
                 encoder_n_layers=3,
                 n_mels=80,
                 outputs_per_step=1,
                 decoder_num_head=4,
                 decoder_n_layers=3):
        """TransformerTTS model.

        Args:
            embedding_size (int): the embedding size passed to the encoder.
            num_hidden (int): the size of hidden layer in network.
            encoder_num_head (int, optional): the head number of multihead attention in encoder. Defaults to 4.
            encoder_n_layers (int, optional): the layers number of multihead attention in encoder. Defaults to 3.
            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
            decoder_num_head (int, optional): the head number of multihead attention in decoder. Defaults to 4.
            decoder_n_layers (int, optional): the layers number of multihead attention in decoder. Defaults to 3.
        """
        super(TransformerTTS, self).__init__()
        self.encoder = Encoder(
            embedding_size, num_hidden, encoder_num_head, encoder_n_layers)
        self.decoder = Decoder(
            num_hidden, n_mels, outputs_per_step, decoder_num_head,
            decoder_n_layers)

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """
        TransformerTTS network.

        Args:
            characters (Variable): shape(B, T_text), dtype float32, the input character,
                where T_text means the timesteps of input text,
            mel_input (Variable): shape(B, T_mel, C), dtype float32, the input query of decoder,
                where T_mel means the timesteps of input spectrum,
            pos_text (Variable): shape(B, T_text), dtype int64, the characters position.
            pos_mel (Variable): the spectrum position, passed to the decoder as ``positional``.

        Returns:
            mel_output (Variable): shape(B, T_mel, C), the decoder output after mel linear projection.
            postnet_output (Variable): shape(B, T_mel, C), the decoder output after post mel network.
            attn_probs (list[Variable]): len(n_layers), the encoder-decoder attention list.
            stop_preds (Variable): the stop tokens of output.
            attns_enc (list[Variable]): len(n_layers), the encoder self attention list.
            attns_dec (list[Variable]): len(n_layers), the decoder self attention list.
        """
        encoded, attns_enc, query_mask = self.encoder(characters, pos_text)
        # The encoder output serves as both key and value of the decoder.
        decoded = self.decoder(encoded, encoded, mel_input, pos_mel,
                               query_mask)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = decoded
        return (mel_output, postnet_output, attn_probs, stop_preds, attns_enc,
                attns_dec)

View File

@ -1,101 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import librosa
import os, copy
from scipy import signal
import paddle.fluid.layers as layers
def get_positional_table(d_pos_vec, n_position=1024):
    """Build a sinusoidal positional table of shape (n_position, d_pos_vec).

    Row 0 is all zeros. For every other position ``pos`` the raw angle of
    column ``i`` is ``pos / 10000 ** (2 * i / d_pos_vec)``; even columns then
    hold its sine and odd columns its cosine.

    NOTE(review): the exponent uses ``i`` rather than ``i // 2`` (compare
    ``get_sinusoid_encoding_table``); kept as-is to preserve behaviour.

    Args:
        d_pos_vec (int): width of each positional vector.
        n_position (int, optional): number of positions. Defaults to 1024.

    Returns:
        numpy.ndarray: the positional table.
    """

    def angles_for(pos):
        # Position 0 gets an all-zero row.
        if pos == 0:
            return np.zeros(d_pos_vec)
        return [
            pos / np.power(10000, 2 * i / d_pos_vec)
            for i in range(d_pos_vec)
        ]

    table = np.array([angles_for(pos) for pos in range(n_position)])
    # Row 0 is deliberately left untouched (stays zero).
    table[1:, 0::2] = np.sin(table[1:, 0::2])  # even columns
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # odd columns
    return table
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    """Build a sinusoid position encoding table of shape (n_position, d_hid).

    The raw angle for position ``pos`` and column ``j`` is
    ``pos / 10000 ** (2 * (j // 2) / d_hid)``; even columns hold its sine and
    odd columns its cosine.

    Args:
        n_position (int): number of positions (rows).
        d_hid (int): width of each encoding vector (columns).
        padding_idx (int, optional): if given, that row is set to zero.
            Defaults to None.

    Returns:
        numpy.ndarray: the encoding table.
    """
    rows = []
    for pos in range(n_position):
        rows.append([
            pos / np.power(10000, 2 * (col // 2) / d_hid)
            for col in range(d_hid)
        ])
    table = np.array(rows)

    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns

    if padding_idx is not None:
        # The padding position is encoded as the zero vector.
        table[padding_idx] = 0.
    return table
def get_non_pad_mask(seq, num_head, dtype):
    """Return a mask that is 1 where ``seq`` is non-zero and 0 elsewhere.

    The comparison result is cast to ``dtype``, given a trailing singleton
    axis, and expanded ``num_head`` times along the first axis.

    Args:
        seq (Variable): sequence in which the value 0 marks padding.
        num_head (int): expansion factor for the first axis.
        dtype (str): data type of the returned mask.

    Returns:
        Variable: the expanded non-padding mask.
    """
    keep = layers.unsqueeze(layers.cast(seq != 0, dtype=dtype), axes=[-1])
    return layers.expand(keep, [num_head, 1, 1])
def get_attn_key_pad_mask(seq_k, num_head, dtype):
    """Build a mask that suppresses the padded part of a key sequence.

    Positions where ``seq_k`` equals 0 receive -1e30 and all others 0. A
    singleton axis is inserted at position 1 so the mask fits the key-query
    attention matrix, and the result is expanded ``num_head`` times along
    the first axis.

    Args:
        seq_k (Variable): key sequence in which the value 0 marks padding.
        num_head (int): expansion factor for the first axis.
        dtype (str): data type of the returned mask.

    Returns:
        Variable: the expanded key padding mask.
    """
    is_pad = layers.cast(seq_k == 0, dtype=dtype)
    penalty = layers.unsqueeze(is_pad * -1e30, axes=[1])
    return layers.expand(penalty, [num_head, 1, 1])
def get_dec_attn_key_pad_mask(seq_k, num_head, dtype):
    """Build the decoder mask combining key padding and future positions.

    An entry is -1e30 when the key position is padding (``seq_k == 0``) or
    lies strictly above the diagonal of the (len_k, len_k) matrix, and 0
    otherwise. The result is expanded ``num_head`` times along the first
    axis.

    Args:
        seq_k (Variable): key sequence in which the value 0 marks padding;
            its second dimension is used as the key length.
        num_head (int): expansion factor for the first axis.
        dtype (str): data type of the returned mask.

    Returns:
        Variable: the expanded decoder attention mask.
    """
    key_len = seq_k.shape[1]

    # 1 at padded key positions, with an extra axis for the query dimension.
    is_pad = layers.unsqueeze(layers.cast(seq_k == 0, dtype=dtype), axes=[1])
    # 1 strictly above the diagonal.
    upper = layers.triu(
        layers.ones(
            shape=[key_len, key_len], dtype=dtype), diagonal=1)

    blocked = is_pad + upper
    penalty = layers.cast(blocked != 0, dtype=dtype) * -1e30
    return layers.expand(penalty, [num_head, 1, 1])
def guided_attention(N, T, g=0.2):
    """Compute the guided attention weight matrix.

    Entry ``[n, t]`` equals ``1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))``,
    so it is 0 where the two normalised positions coincide and grows towards
    1 as they move apart.

    The matrix is built with NumPy broadcasting instead of a Python-level
    double loop over every (n, t) pair.

    Args:
        N (int): number of rows.
        T (int): number of columns.
        g (float, optional): width of the band around the diagonal.
            Defaults to 0.2.

    Returns:
        numpy.ndarray: float32 array of shape (N, T).
    """
    row_frac = np.arange(N, dtype=np.float64) / float(N)
    col_frac = np.arange(T, dtype=np.float64) / float(T)
    # (1, T) - (N, 1) broadcasts to the full (N, T) grid of differences.
    delta = col_frac[np.newaxis, :] - row_frac[:, np.newaxis]
    W = 1 - np.exp(-delta**2 / (2 * g * g))
    return W.astype(np.float32)
def cross_entropy(input, label, weight=1.0, epsilon=1e-30):
    """Compute a weighted binary cross entropy averaged over dims 0 and 1.

    Args:
        input (Variable): predicted probabilities.
        label (Variable): targets; entries where ``label`` is 1 are scaled
            by ``weight`` and entries where it is 0 are left unscaled.
        weight (float, optional): scaling applied to positive entries.
            Defaults to 1.0.
        epsilon (float, optional): value added inside each logarithm.
            Defaults to 1e-30.

    Returns:
        Variable: the loss reduced by mean over dimensions 0 and 1.
    """
    positive_term = -1 * label * layers.log(input + epsilon)
    negative_term = (1 - label) * layers.log(1 - input + epsilon)
    per_element = positive_term - negative_term

    # Evaluates to `weight` where label == 1 and to 1 where label == 0.
    scale = label * (weight - 1) + 1
    return layers.reduce_mean(per_element * scale, dim=[0, 1])

View File

@ -1,55 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts.cbhg import CBHG
class Vocoder(dg.Layer):
    """CBHG network converting mel spectrograms to linear spectrograms."""

    def __init__(self, batch_size, hidden_size, num_mels=80, n_fft=2048):
        """CBHG Network (mel -> linear)

        Args:
            batch_size (int): the batch size of input.
            hidden_size (int): the size of hidden layer in network.
            num_mels (int, optional): the number of mel bands when
                calculating mel spectrograms. Defaults to 80.
            n_fft (int, optional): length of the windowed signal after
                padding with zeros. Defaults to 2048.
        """
        super(Vocoder, self).__init__()
        # The output projection has n_fft // 2 + 1 channels.
        linear_bins = (n_fft // 2) + 1

        self.pre_proj = Conv1D(
            num_channels=num_mels, num_filters=hidden_size, filter_size=1)
        self.cbhg = CBHG(hidden_size, batch_size)
        self.post_proj = Conv1D(
            num_channels=hidden_size, num_filters=linear_bins, filter_size=1)

    def forward(self, mel):
        """Compute the linear spectrum from a mel spectrum.

        Axes 1 and 2 of the input are swapped before the first projection
        and swapped back after the last one.

        Args:
            mel (Variable): 3-D float32 mel spectrum. The previous
                documentation stated shape(B, C, T).
                NOTE(review): confirm the layout against what Conv1D expects.

        Returns:
            mag_pred (Variable): the linear output. The previous
                documentation stated shape(B, T, C).
        """
        hidden = layers.transpose(mel, [0, 2, 1])
        hidden = self.cbhg(self.pre_proj(hidden))
        projected = self.post_proj(hidden)
        return layers.transpose(projected, [0, 2, 1])

View File

@ -1,15 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule

View File

@ -1,443 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
from parakeet.modules import weight_norm
def get_param_attr(layer_type, filter_size, c_in=1):
    """Create the weight and bias parameter attributes for a conv layer.

    Args:
        layer_type (str): "weight_norm" initialises weight and bias
            uniformly in [-k, k] with k = sqrt(1 / (c_in * prod(filter_size)));
            "common" initialises both to the constant 0.0.
        filter_size (tuple): kernel size; only its product is used.
        c_in (int, optional): number of input channels. Defaults to 1.

    Returns:
        tuple: (param_attr, bias_attr), two ``fluid.ParamAttr`` objects.

    Raises:
        TypeError: if ``layer_type`` is neither "weight_norm" nor "common".
    """
    if layer_type not in ("weight_norm", "common"):
        raise TypeError("Unsupported layer type.")

    if layer_type == "weight_norm":
        bound = np.sqrt(1.0 / (c_in * np.prod(filter_size)))
        weight_init = fluid.initializer.UniformInitializer(
            low=-bound, high=bound)
        bias_init = fluid.initializer.UniformInitializer(
            low=-bound, high=bound)
    else:
        weight_init = fluid.initializer.ConstantInitializer(0.0)
        bias_init = fluid.initializer.ConstantInitializer(0.0)

    return (fluid.ParamAttr(initializer=weight_init),
            fluid.ParamAttr(initializer=bias_init))
def unfold(x, n_group):
    """Split the last axis of ``x`` into (length // n_group, n_group).

    Args:
        x (Variable): input whose ``shape`` is a list.
        n_group (int): size of the new trailing axis.

    Returns:
        Variable: ``x`` reshaped to ``shape[:-1] + [length // n_group, n_group]``.
    """
    leading_dims = x.shape[:-1]
    n_chunks = x.shape[-1] // n_group
    return fluid.layers.reshape(x, leading_dims + [n_chunks, n_group])
class WaveFlowLoss:
    """Loss computed from the latent variable and log-scale terms of WaveFlow."""

    def __init__(self, sigma=1.0):
        """Store the standard deviation used by the loss.

        Args:
            sigma (float, optional): standard deviation. Defaults to 1.0.
        """
        self.sigma = sigma

    def __call__(self, model_output):
        """Compute the loss.

        Args:
            model_output (tuple): ``(z, log_s_list)``, the latent variable
                and the list of log scaling variables.

        Returns:
            Variable: ``(sum(z * z) / (2 * sigma^2) - sum(log_s)) / numel(z)``
                plus the constant ``0.5 * log(2 * pi) + log(sigma)``.
        """
        z, log_s_list = model_output

        # Accumulate the summed log-scales of every entry in the list.
        for idx, log_s in enumerate(log_s_list):
            partial = fluid.layers.reduce_sum(log_s)
            log_s_total = partial if idx == 0 else log_s_total + partial

        variance = self.sigma * self.sigma
        loss = fluid.layers.reduce_sum(z * z) / (2 * variance) - log_s_total
        loss = loss / np.prod(z.shape)

        const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
        return loss + const
class Conditioner(dg.Layer):
    """Stack of weight-normalised transposed 2-D convolutions.

    One transposed convolution is created per entry of ``upsample_factors``;
    each uses stride ``(1, s)``, so only the last axis is strided.
    """

    def __init__(self, dtype, upsample_factors):
        """Create one transposed convolution per upsample factor.

        Args:
            dtype (str): data type of the layers.
            upsample_factors (list[int]): stride of each layer along the
                last axis.
        """
        super(Conditioner, self).__init__()

        self.upsample_conv2d = []
        for factor in upsample_factors:
            kernel = (3, 2 * factor)
            weight_attr, bias_attr = get_param_attr(
                "weight_norm", kernel, c_in=1)
            self.upsample_conv2d.append(
                weight_norm.Conv2DTranspose(
                    num_channels=1,
                    num_filters=1,
                    filter_size=kernel,
                    padding=(1, factor // 2),
                    stride=(1, factor),
                    param_attr=weight_attr,
                    bias_attr=bias_attr,
                    dtype=dtype))

        # The layers live in a plain list, so register them explicitly.
        for idx, conv in enumerate(self.upsample_conv2d):
            self.add_sublayer("conv2d_transpose_{}".format(idx), conv)

    def forward(self, x):
        """Upsample ``x`` (training path).

        A singleton axis is inserted at position 1, every layer followed by
        a leaky ReLU (alpha=0.4) is applied, and the axis is removed again.
        """
        hidden = fluid.layers.unsqueeze(x, 1)
        for conv in self.upsample_conv2d:
            hidden = fluid.layers.leaky_relu(conv(hidden), alpha=0.4)
        return fluid.layers.squeeze(hidden, [1])

    def infer(self, x):
        """Upsample ``x`` (inference path).

        Same as ``forward`` except that after each layer the trailing
        ``filter_size[1] - stride[1]`` steps of the last axis are dropped.
        """
        hidden = fluid.layers.unsqueeze(x, 1)
        for conv in self.upsample_conv2d:
            upsampled = conv(hidden)
            # Trim conv artifacts.
            trim = conv._filter_size[1] - conv._stride[1]
            hidden = fluid.layers.leaky_relu(
                upsampled[:, :, :, :-trim], alpha=0.4)
        return fluid.layers.squeeze(hidden, [1])
class Flow(dg.Layer):
    """Network of a single flow: gated, dilated 2-D convolutions over audio.

    The input passes through a 1x1 "start" convolution, then ``n_layers``
    blocks (dilated conv on the audio + 1x1 conv on the mel condition, a
    tanh * sigmoid gate, and a 1x1 residual/skip conv), and finally a 1x1
    "end" convolution with 2 output channels.
    """

    def __init__(self, config):
        """Create all sublayers.

        Args:
            config (obj): configuration; the attributes read here are
                n_layers, n_channels, kernel_h, kernel_w, use_fp16, n_group
                and mel_bands.

        Raises:
            KeyError: if ``config.n_group`` is not one of 8, 16, 32, 64, 128.
        """
        super(Flow, self).__init__()
        self.n_layers = config.n_layers
        self.n_channels = config.n_channels
        self.kernel_h = config.kernel_h
        self.kernel_w = config.kernel_w
        self.dtype = "float16" if config.use_fp16 else "float32"

        # Transform audio: [batch, 1, n_group, time/n_group]
        # => [batch, n_channels, n_group, time/n_group]
        param_attr, bias_attr = get_param_attr("weight_norm", (1, 1), c_in=1)
        self.start = weight_norm.Conv2D(
            num_channels=1,
            num_filters=self.n_channels,
            filter_size=(1, 1),
            param_attr=param_attr,
            bias_attr=bias_attr,
            dtype=self.dtype)

        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability
        # output shape: [batch, 2, n_group, time/n_group]
        param_attr, bias_attr = get_param_attr(
            "common", (1, 1), c_in=self.n_channels)
        self.end = dg.Conv2D(
            num_channels=self.n_channels,
            num_filters=2,
            filter_size=(1, 1),
            param_attr=param_attr,
            bias_attr=bias_attr,
            dtype=self.dtype)

        # receptive fields: (kernel - 1) * sum(dilations) + 1 >= squeeze
        # Height dilations per layer, keyed by n_group.
        dilation_dict = {
            8: [1, 1, 1, 1, 1, 1, 1, 1],
            16: [1, 1, 1, 1, 1, 1, 1, 1],
            32: [1, 2, 4, 1, 2, 4, 1, 2],
            64: [1, 2, 4, 8, 16, 1, 2, 4],
            128: [1, 2, 4, 8, 16, 32, 64, 1]
        }
        self.dilation_h_list = dilation_dict[config.n_group]

        # Plain Python lists; every layer is also registered through
        # add_sublayer at the end of each loop iteration.
        self.in_layers = []
        self.cond_layers = []
        self.res_skip_layers = []
        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
            # Width dilation doubles with every layer.
            dilation_w = 2**i

            # Dilated convolution on the audio; 2 * n_channels outputs
            # feed the tanh and sigmoid halves of the gate.
            param_attr, bias_attr = get_param_attr(
                "weight_norm", (self.kernel_h, self.kernel_w),
                c_in=self.n_channels)
            in_layer = weight_norm.Conv2D(
                num_channels=self.n_channels,
                num_filters=2 * self.n_channels,
                filter_size=(self.kernel_h, self.kernel_w),
                dilation=(dilation_h, dilation_w),
                param_attr=param_attr,
                bias_attr=bias_attr,
                dtype=self.dtype)
            self.in_layers.append(in_layer)

            # 1x1 convolution projecting the mel condition to the same
            # 2 * n_channels so it can be added to the audio branch.
            param_attr, bias_attr = get_param_attr(
                "weight_norm", (1, 1), c_in=config.mel_bands)
            cond_layer = weight_norm.Conv2D(
                num_channels=config.mel_bands,
                num_filters=2 * self.n_channels,
                filter_size=(1, 1),
                param_attr=param_attr,
                bias_attr=bias_attr,
                dtype=self.dtype)
            self.cond_layers.append(cond_layer)

            # Every layer but the last emits residual + skip channels;
            # the last one emits skip channels only.
            if i < self.n_layers - 1:
                res_skip_channels = 2 * self.n_channels
            else:
                res_skip_channels = self.n_channels
            param_attr, bias_attr = get_param_attr(
                "weight_norm", (1, 1), c_in=self.n_channels)
            res_skip_layer = weight_norm.Conv2D(
                num_channels=self.n_channels,
                num_filters=res_skip_channels,
                filter_size=(1, 1),
                param_attr=param_attr,
                bias_attr=bias_attr,
                dtype=self.dtype)
            self.res_skip_layers.append(res_skip_layer)

            self.add_sublayer("in_layer_{}".format(i), in_layer)
            self.add_sublayer("cond_layer_{}".format(i), cond_layer)
            self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer)

    def forward(self, audio, mel):
        """Training pass over all heights at once.

        Args:
            audio (Variable): [bs, 1, n_group, time/group].
            mel (Variable): [bs, mel_bands, n_group, time/n_group].

        Returns:
            Variable: output of the "end" convolution (2 channels).
        """
        # audio: [bs, 1, n_group, time/group]
        # mel: [bs, mel_bands, n_group, time/n_group]
        audio = self.start(audio)

        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
            dilation_w = 2**i

            # Pad height dim (n_group): causal convolution
            # Pad width dim (time): dilated non-causal convolution
            pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
            pad_left = pad_right = int((self.kernel_w - 1) * dilation_w / 2)
            # Using pad2d is a bit faster than using padding in Conv2D directly
            audio_pad = fluid.layers.pad2d(
                audio, paddings=[pad_top, pad_bottom, pad_left, pad_right])

            hidden = self.in_layers[i](audio_pad)
            cond_hidden = self.cond_layers[i](mel)
            in_acts = hidden + cond_hidden
            # Gate: tanh of the first n_channels times sigmoid of the rest.
            out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
                fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
            res_skip_acts = self.res_skip_layers[i](out_acts)

            if i < self.n_layers - 1:
                # First half is the residual added to the running audio,
                # second half is the skip contribution.
                audio += res_skip_acts[:, :self.n_channels, :, :]
                skip_acts = res_skip_acts[:, self.n_channels:, :, :]
            else:
                skip_acts = res_skip_acts

            # Sum the skip contributions of all layers.
            if i == 0:
                output = skip_acts
            else:
                output += skip_acts

        return self.end(output)

    def infer(self, audio, mel, queues):
        """Inference pass for one height step using cached states.

        Args:
            audio (Variable): input for the current step.
            mel (Variable): condition for the current step.
            queues (list[list]): one list per layer; each is filled with
                zeros on first use and updated in place on every call.

        Returns:
            Variable: output of the "end" convolution (2 channels).
        """
        audio = self.start(audio)

        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
            dilation_w = 2**i

            # Number of previous inputs this layer needs along the height.
            state_size = dilation_h * (self.kernel_h - 1)
            queue = queues[i]

            # First call for this layer: start from an all-zero history.
            if len(queue) == 0:
                for j in range(state_size):
                    queue.append(fluid.layers.zeros_like(audio))

            # Stack the cached history and the current input along axis 2.
            state = queue[0:state_size]
            state = fluid.layers.concat(state + [audio], axis=2)

            # Slide the window: drop the oldest entry, keep the current input.
            queue.pop(0)
            queue.append(audio)

            # Pad height dim (n_group): causal convolution
            # Pad width dim (time): dilated non-causal convolution
            pad_top, pad_bottom = 0, 0
            pad_left = int((self.kernel_w - 1) * dilation_w / 2)
            pad_right = int((self.kernel_w - 1) * dilation_w / 2)
            state = fluid.layers.pad2d(
                state, paddings=[pad_top, pad_bottom, pad_left, pad_right])

            hidden = self.in_layers[i](state)
            cond_hidden = self.cond_layers[i](mel)
            in_acts = hidden + cond_hidden
            # Gate: tanh of the first n_channels times sigmoid of the rest.
            out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
                fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
            res_skip_acts = self.res_skip_layers[i](out_acts)

            if i < self.n_layers - 1:
                audio += res_skip_acts[:, :self.n_channels, :, :]
                skip_acts = res_skip_acts[:, self.n_channels:, :, :]
            else:
                skip_acts = res_skip_acts

            if i == 0:
                output = skip_acts
            else:
                output += skip_acts

        return self.end(output)
class WaveFlowModule(dg.Layer):
    """WaveFlow model implementation.

    Holds a Conditioner that upsamples mel spectrograms and ``n_flows`` Flow
    networks, each followed by a fixed permutation over the height
    (n_group) dimension.

    Args:
        config (obj): model configuration parameters. The attributes read
            here are n_flows, n_group, n_layers, use_fp16 and, if present,
            upsample_factors (otherwise [16, 16] is used).

    Returns:
        WaveFlowModule
    """

    def __init__(self, config):
        super(WaveFlowModule, self).__init__()
        self.n_flows = config.n_flows
        self.n_group = config.n_group
        self.n_layers = config.n_layers
        self.upsample_factors = config.upsample_factors if hasattr(
            config, "upsample_factors") else [16, 16]
        # Both counts must be even: the permutations below split the flows
        # and the height dimension in halves.
        assert self.n_group % 2 == 0
        assert self.n_flows % 2 == 0

        self.dtype = "float16" if config.use_fp16 else "float32"
        self.conditioner = Conditioner(self.dtype, self.upsample_factors)
        self.flows = []
        for i in range(self.n_flows):
            flow = Flow(config)
            self.flows.append(flow)
            self.add_sublayer("flow_{}".format(i), flow)

        # One permutation of the height indices per flow.
        self.perms = []
        half = self.n_group // 2
        for i in range(self.n_flows):
            perm = list(range(self.n_group))
            if i < self.n_flows // 2:
                # First half of the flows: reverse the whole height order.
                perm = perm[::-1]
            else:
                # Second half: reverse each half of the height separately.
                perm[:half] = reversed(perm[:half])
                perm[half:] = reversed(perm[half:])
            self.perms.append(perm)

    def forward(self, audio, mel):
        """Training forward pass.

        Use a conditioner to upsample mel spectrograms into hidden states.
        These hidden states along with the audio are passed to a stack of Flow
        modules to obtain the final latent variable z and a list of log scaling
        variables, which are then passed to the WaveFlowLoss module to calculate
        the negative log likelihood.

        Args:
            audio (obj): audio samples.
            mel (obj): mel spectrograms.

        Returns:
            z (obj): latent variable.
            log_s_list(list): list of log scaling variables.
        """
        mel = self.conditioner(mel)
        # The upsampled mel must cover at least as many steps as the audio.
        assert mel.shape[2] >= audio.shape[1]
        # Prune out the tail of audio/mel so that time % n_group == 0.
        pruned_len = int(audio.shape[1] // self.n_group * self.n_group)
        if audio.shape[1] > pruned_len:
            audio = audio[:, :pruned_len]
        if mel.shape[2] > pruned_len:
            mel = mel[:, :, :pruned_len]

        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
        # From [bs, time] to [bs, n_group, time/n_group]
        audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
        # [bs, 1, n_group, time/n_group]
        audio = fluid.layers.unsqueeze(audio, 1)
        log_s_list = []
        for i in range(self.n_flows):
            # Rows 0..n_group-2 of the audio, paired with mel rows
            # 1..n_group-1, produce the transform for rows 1..n_group-1.
            inputs = audio[:, :, :-1, :]
            conds = mel[:, :, 1:, :]
            outputs = self.flows[i](inputs, conds)
            # Channel 0 is the log scale, channel 1 the shift.
            log_s = outputs[:, :1, :, :]
            b = outputs[:, 1:, :, :]
            log_s_list.append(log_s)

            # Row 0 passes through unchanged; the rest is scaled and shifted.
            audio_0 = audio[:, :, :1, :]
            audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b
            audio = fluid.layers.concat([audio_0, audio_out], axis=2)

            # Permute over the height dim.
            audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
            audio = fluid.layers.stack(audio_slices, axis=2)
            mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
            mel = fluid.layers.stack(mel_slices, axis=2)

        z = fluid.layers.squeeze(audio, [1])
        return z, log_s_list

    def synthesize(self, mel, sigma=1.0):
        """Use model to synthesize waveform.

        Use a conditioner to upsample mel spectrograms into hidden states.
        These hidden states along with initial random gaussian latent variable
        are passed to a stack of Flow modules to obtain the audio output.

        Note that we use convolutional queue (https://arxiv.org/abs/1611.09482)
        to cache the intermediate hidden states, which will speed up the
        autoregressive inference over the height dimension. Current
        implementation only supports height dimension (self.n_group) equals
        8 or 16, i.e., where there is no dilation on the height dimension.

        Args:
            mel (obj): mel spectrograms.
            sigma (float, optional): standard deviation of the Gaussian latent
                variable. Defaults to 1.0.

        Returns:
            audio (obj): synthesized audio.
        """
        if self.dtype == "float16":
            mel = fluid.layers.cast(mel, self.dtype)
        mel = self.conditioner.infer(mel)
        # Prune out the tail of mel so that time % n_group == 0.
        pruned_len = int(mel.shape[2] // self.n_group * self.n_group)
        if mel.shape[2] > pruned_len:
            mel = mel[:, :, :pruned_len]
        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])

        # Initial latent variable, matching mel in batch, height and width.
        audio = fluid.layers.gaussian_random(
            shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma)
        if self.dtype == "float16":
            audio = fluid.layers.cast(audio, self.dtype)
        # Walk the flows in the opposite order of the training pass.
        for i in reversed(range(self.n_flows)):
            # Permute over the height dimension.
            audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
            audio = fluid.layers.stack(audio_slices, axis=2)
            mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
            mel = fluid.layers.stack(mel_slices, axis=2)

            audio_list = []
            # Row 0 is kept as-is and seeds the row-by-row generation.
            audio_0 = audio[:, :, 0:1, :]
            audio_list.append(audio_0)
            audio_h = audio_0
            # Fresh per-layer caches for this flow.
            queues = [[] for _ in range(self.n_layers)]

            for h in range(1, self.n_group):
                inputs = audio_h
                conds = mel[:, :, h:(h + 1), :]
                outputs = self.flows[i].infer(inputs, conds, queues)
                log_s = outputs[:, 0:1, :, :]
                b = outputs[:, 1:, :, :]
                # Undo the scale-and-shift applied in forward().
                audio_h = (audio[:, :, h:(h+1), :] - b) / \
                    fluid.layers.exp(log_s)
                audio_list.append(audio_h)

            audio = fluid.layers.concat(audio_list, axis=2)

        # audio: [bs, n_group, time/n_group]
        audio = fluid.layers.squeeze(audio, [1])
        # audio: [bs, time]
        audio = fluid.layers.reshape(
            fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])
        return audio