add transformerTTS and fastspeech

This commit is contained in:
lifuchen 2020-02-10 07:38:29 +00:00 committed by chenfeiyu
parent d0015239db
commit 185e25fedf
25 changed files with 356 additions and 756 deletions

View File

@@ -0,0 +1,4 @@
# FastSpeech
Paddle fluid implementation of FastSpeech, a feed-forward network based on the Transformer. The implementation is based on [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263).
We implement the FastSpeech model in Paddle fluid with dynamic graph, which is convenient for building flexible network architectures.
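A minimal usage sketch (not from this commit; the YAML path is illustrative), parsing the config the same way the training script does and building the model under a dygraph guard:

import jsonargparse
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from parse import add_config_options_to_parser  # helper shipped next to the training script
from parakeet.models.fastspeech.fastspeech import FastSpeech

parser = jsonargparse.ArgumentParser(description="Train FastSpeech model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/fastspeech.yaml'.split())  # illustrative config path
place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
with dg.guard(place):
    model = FastSpeech(cfg)
    model.train()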

View File

@@ -35,10 +35,10 @@ epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
-use_data_parallel: False
+use_data_parallel: True
-data_path: ../../../dataset/LJSpeech-1.1
-transtts_path: ../transformerTTS/checkpoint/
+data_path: ../../dataset/LJSpeech-1.1
+transtts_path: ../TransformerTTS/checkpoint/
transformer_step: 200000
save_path: ./checkpoint
log_dir: ./log
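This config flips use_data_parallel on; a sketch of the fluid dygraph multi-GPU pattern that setting enables (my guess at the trainer's internals, run under paddle.distributed.launch; cfg parsed as above):

import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from parakeet.models.fastspeech.fastspeech import FastSpeech

local_rank = dg.parallel.Env().local_rank
place = fluid.CUDAPlace(local_rank)
with dg.guard(place):
    strategy = dg.parallel.prepare_context()  # set up NCCL communication
    model = dg.parallel.DataParallel(FastSpeech(cfg), strategy)
    # inside the training loop, after computing `loss`:
    loss = model.scale_loss(loss)     # scale loss by trainer count
    loss.backward()
    model.apply_collective_grads()    # all-reduce gradients across GPUs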

View File

@@ -14,9 +14,9 @@ import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.network import TransformerTTS
-from network import FastSpeech
-from utils import get_alignment
+from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.fastspeech.fastspeech import FastSpeech
+from parakeet.models.fastspeech.utils import get_alignment
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
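The dicts returned here are applied back with set_dict; a short sketch of the restore step (step and path taken from the config above; set_dict is standard fluid dygraph usage):

model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), cfg.transtts_path)
model.set_dict(model_dict)      # restore model parameters
optimizer.set_dict(opti_dict)   # restore optimizer state (e.g. Adam moments), if resuming training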

View File

@@ -0,0 +1,4 @@
# TransformerTTS
Paddle fluid implementation of TransformerTTS, a neural TTS model based on the Transformer network. The implementation is based on [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895).
We implement the TransformerTTS model in Paddle fluid with dynamic graph, which is convenient for building flexible network architectures.
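A shape-level sketch of calling the model, following the forward signature in this commit (dummy single-utterance batch; cfg parsed as in train_transformer.py):

import numpy as np
import paddle.fluid.dygraph as dg
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS

with dg.guard():
    model = TransformerTTS(cfg)
    characters = dg.to_variable(np.array([[12, 7, 25, 4]], dtype=np.int64))        # (B, T_text) text ids
    pos_text = dg.to_variable(np.arange(1, 5, dtype=np.int64).reshape(1, -1))      # 1-based positions
    mel_input = dg.to_variable(np.zeros([1, 10, cfg.audio.num_mels], np.float32))  # shifted mel, (B, T_mel, num_mels)
    pos_mel = dg.to_variable(np.arange(1, 11, dtype=np.int64).reshape(1, -1))
    mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = model(
        characters, mel_input, pos_text, pos_mel)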

View File

@@ -23,10 +23,10 @@ lr: 0.001
save_step: 1000
image_step: 2000
use_gpu: True
-use_data_parallel: False
+use_data_parallel: True
stop_token: False
-data_path: ../../../dataset/LJSpeech-1.1
+data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint

View File

@@ -20,9 +20,9 @@ epochs: 10000
lr: 0.001
save_step: 10
use_gpu: True
-use_data_parallel: False
+use_data_parallel: True
-data_path: ../../../dataset/LJSpeech-1.1
+data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint

View File

@@ -7,11 +7,13 @@ import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
from matplotlib import cm
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.modules.utils import cross_entropy
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from network import *
+from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -86,10 +88,12 @@ def main(cfg):
if local_rank==0:
writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(),
-'post_mel_loss':post_mel_loss.numpy(),
-'stop_loss':stop_loss.numpy()
+'post_mel_loss':post_mel_loss.numpy()
}, global_step)
+if cfg.stop_token:
+writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
writer.add_scalars('alphas', {
'encoder_alpha':model.encoder.alpha.numpy(),
'decoder_alpha':model.decoder.alpha.numpy(),

View File

@@ -6,11 +6,14 @@ from collections import OrderedDict
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from network import *
+from parakeet.models.transformerTTS.vocoder import Vocoder
def load_checkpoint(step, model_path):
-model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
+model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
for param in model_dict:
if param.startswith('_layers.'):
@@ -40,7 +43,7 @@ def main(cfg):
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
-model = ModelPostNet(cfg)
+model = Vocoder(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
@@ -99,5 +102,5 @@ def main(cfg):
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
-cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())
+cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
main(cfg)

View File

@@ -1,3 +1,3 @@
__version__ = "0.0.0"
-from . import data, g2p, models, modules, utils
+from . import data, g2p, models, modules

View File

View File

@@ -0,0 +1,152 @@
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
import csv
from paddle import fluid
from parakeet import g2p
from parakeet import audio
from parakeet.data.sampler import *
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader:
def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(config.data_path)
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer)
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
assert config.batch_size % nranks == 0
each_bs = config.batch_size // nranks
if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
else:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
self.reader = fluid.io.DataLoader.from_generator(
capacity=32,
iterable=True,
use_double_buffer=True,
return_list=True)
self.reader.set_batch_generator(dataloader, place)
class LJSpeechMetaData(DatasetMixin):
def __init__(self, root):
self.root = Path(root)
self._wav_dir = self.root.joinpath("wavs")
csv_path = self.root.joinpath("metadata.csv")
self._table = pd.read_csv(
csv_path,
sep="|",
header=None,
quoting=csv.QUOTE_NONE,
names=["fname", "raw_text", "normalized_text"])
def get_example(self, i):
fname, raw_text, normalized_text = self._table.iloc[i]
fname = str(self._wav_dir.joinpath(fname + ".wav"))
return fname, raw_text, normalized_text
def __len__(self):
return len(self._table)
class LJSpeech(object):
def __init__(self, config):
super(LJSpeech, self).__init__()
self.config = config
self._ljspeech_processor = audio.AudioProcessor(
sample_rate=config.audio.sr,
num_mels=config.audio.num_mels,
min_level_db=config.audio.min_level_db,
ref_level_db=config.audio.ref_level_db,
n_fft=config.audio.n_fft,
win_length= config.audio.win_length,
hop_length= config.audio.hop_length,
power=config.audio.power,
preemphasis=config.audio.preemphasis,
signal_norm=True,
symmetric_norm=False,
max_norm=1.,
mel_fmin=0,
mel_fmax=None,
clip_norm=True,
griffin_lim_iters=60,
do_trim_silence=False,
sound_norm=False)
def __call__(self, metadatum):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
This method may require several processor, each of which has a lot of options.
In this case, you'd better pass a composed transform and pass it to the init
method.
"""
fname, raw_text, normalized_text = metadatum
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav = self._ljspeech_processor.load_wav(str(fname))
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
def batch_examples(batch):
texts = []
mels = []
mel_inputs = []
mel_lens = []
text_lens = []
pos_texts = []
pos_mels = []
for data in batch:
_, mel, text = data
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
mel_lens.append(mel.shape[1])
text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1))
pos_mels.append(np.arange(1, mel.shape[1] + 1))
mels.append(mel)
texts.append(text)
# Sort by text_len in descending order
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
text_lens = sorted(text_lens, reverse=True)
# Pad sequence with largest len of the batch
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
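The sorting above leans on a zip/sorted idiom that reorders every field by descending text length; a toy illustration of just that pattern:

text_lens = [3, 5, 2]
texts = ['abc', 'hello', 'hi']
texts = [i for i, _ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
text_lens = sorted(text_lens, reverse=True)
assert texts == ['hello', 'abc', 'hi'] and text_lens == [5, 3, 2]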
def batch_examples_vocoder(batch):
mels=[]
mags=[]
for data in batch:
mag, mel, _ = data
mels.append(mel)
mags.append(mag)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
return (mels, mags)
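A usage sketch for the loader (single process, hence nranks=1 and rank=0; cfg is a parsed training config as in the examples above):

import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from parakeet.models.dataloader.ljspeech import LJSpeechLoader

place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
with dg.guard(place):
    loader = LJSpeechLoader(cfg, nranks=1, rank=0, is_vocoder=False)
    for batch in loader.reader():
        # field order matches batch_examples above
        texts, mels, mel_inputs, pos_texts, pos_mels, text_lens, mel_lens = batch
        break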

View File

@@ -0,0 +1,36 @@
import numpy as np
import math
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
super(FFTBlock, self).__init__()
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
"""
Feed Forward Transformer block in FastSpeech.
Args:
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
T means the timesteps of input.
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
len_q means the sequence length of query, len_k means the sequence length of key.
Returns:
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
"""
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask
output = self.pos_ffn(output)
output *= non_pad_mask
return output, slf_attn
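A forward sketch for FFTBlock with illustrative hyperparameters (not the repo defaults); the masks are built with the same helpers the FastSpeech encoder and decoder below use:

import numpy as np
import paddle.fluid.dygraph as dg
from parakeet.models.fastspeech.FFTBlock import FFTBlock
from parakeet.modules.utils import get_non_pad_mask, get_attn_key_pad_mask

with dg.guard():
    block = FFTBlock(d_model=384, d_inner=1536, n_head=2, d_k=192, d_v=192,
                     filter_size=3, padding=1)  # illustrative sizes
    pos = dg.to_variable(np.array([[1, 2, 3, 0]], dtype=np.int64))  # 0 marks padding
    x = dg.to_variable(np.random.randn(1, 4, 384).astype(np.float32))
    out, attn = block(x, non_pad_mask=get_non_pad_mask(pos),
                      slf_attn_mask=get_attn_key_pad_mask(seq_k=pos, seq_q=pos))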

View File

@@ -1,42 +1,10 @@
import numpy as np
import math
-import utils
+import parakeet.models.fastspeech.utils
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.layers import Conv, Linear
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
super(FFTBlock, self).__init__()
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
"""
Feed Forward Transformer block in FastSpeech.
Args:
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
T means the timesteps of input.
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
len_q means the sequence length of query, len_k means the sequence length of key.
Returns:
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
"""
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask
output = self.pos_ffn(output)
output *= non_pad_mask
return output, slf_attn
class LengthRegulator(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):

View File

@@ -0,0 +1,63 @@
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols
from parakeet.modules.utils import *
from parakeet.modules.post_convnet import PostConvNet
from parakeet.modules.layers import Linear
from parakeet.models.fastspeech.FFTBlock import FFTBlock
class Decoder(dg.Layer):
def __init__(self,
len_max_seq,
n_layers,
n_head,
d_k,
d_v,
d_model,
d_inner,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=0.1):
super(Decoder, self).__init__()
n_position = len_max_seq + 1
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, enc_seq, enc_pos):
"""
Decoder layer of FastSpeech.
Args:
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
The output of length regulator.
enc_pos (Variable, optional): Shape(B, T_mel),
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
Returns:
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
dec_slf_attn_list = []
# -- Prepare masks
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
non_pad_mask = get_non_pad_mask(enc_pos)
# -- Forward
dec_output = enc_seq + self.position_enc(enc_pos)
for dec_layer in self.layer_stack:
dec_output, dec_slf_attn = dec_layer(
dec_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
dec_slf_attn_list += [dec_slf_attn]
return dec_output, dec_slf_attn_list

View File

@@ -0,0 +1,67 @@
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols
from parakeet.modules.utils import *
from parakeet.modules.post_convnet import PostConvNet
from parakeet.modules.layers import Linear
from parakeet.models.fastspeech.FFTBlock import FFTBlock
class Encoder(dg.Layer):
def __init__(self,
n_src_vocab,
len_max_seq,
n_layers,
n_head,
d_k,
d_v,
d_model,
d_inner,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=0.1):
super(Encoder, self).__init__()
n_position = len_max_seq + 1
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, character, text_pos):
"""
Encoder layer of FastSpeech.
Args:
character (Variable): Shape(B, T_text), dtype: float32. The input text
characters. T_text means the timesteps of input characters.
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
position. T_text means the timesteps of input characters.
Returns:
enc_output (Variable), Shape(B, text_T, C), the encoder output.
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
"""
enc_slf_attn_list = []
# -- prepare masks
# shape character (N, T)
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
non_pad_mask = get_non_pad_mask(character)
# -- Forward
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(
enc_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask, enc_slf_attn_list
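A shape-level sketch of driving this encoder; positions are 1-based because index 0 is the padding_idx of the position table (which is why n_position = len_max_seq + 1). The layer sizes here are illustrative:

import numpy as np
import paddle.fluid.dygraph as dg
from parakeet.g2p.text.symbols import symbols
from parakeet.models.fastspeech.encoder import Encoder

with dg.guard():
    encoder = Encoder(n_src_vocab=len(symbols) + 1, len_max_seq=2048, n_layers=6,
                      n_head=2, d_k=64, d_v=64, d_model=384, d_inner=1536,
                      fft_conv1d_kernel=3, fft_conv1d_padding=1)
    character = dg.to_variable(np.array([[12, 7, 25, 0]], dtype=np.int64))  # 0 = pad
    text_pos = dg.to_variable(np.array([[1, 2, 3, 0]], dtype=np.int64))
    enc_output, non_pad_mask, attn_list = encoder(character, text_pos)  # (1, 4, 384), (1, 4, 1), 6 attentions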

View File

@@ -4,124 +4,10 @@ from parakeet.g2p.text.symbols import symbols
from parakeet.modules.utils import *
from parakeet.modules.post_convnet import PostConvNet
from parakeet.modules.layers import Linear
-from utils import *
-from modules import FFTBlock, LengthRegulator
class Encoder(dg.Layer):
def __init__(self,
n_src_vocab,
len_max_seq,
n_layers,
n_head,
d_k,
d_v,
d_model,
d_inner,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=0.1):
super(Encoder, self).__init__()
n_position = len_max_seq + 1
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, character, text_pos):
"""
Encoder layer of FastSpeech.
Args:
character (Variable): Shape(B, T_text), dtype: float32. The input text
characters. T_text means the timesteps of input characters.
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
position. T_text means the timesteps of input characters.
Returns:
enc_output (Variable), Shape(B, text_T, C), the encoder output.
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
"""
enc_slf_attn_list = []
# -- prepare masks
# shape character (N, T)
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
non_pad_mask = get_non_pad_mask(character)
# -- Forward
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(
enc_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask, enc_slf_attn_list
class Decoder(dg.Layer):
def __init__(self,
len_max_seq,
n_layers,
n_head,
d_k,
d_v,
d_model,
d_inner,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=0.1):
super(Decoder, self).__init__()
n_position = len_max_seq + 1
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, enc_seq, enc_pos):
"""
Decoder layer of FastSpeech.
Args:
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
The output of length regulator.
enc_pos (Variable, optional): Shape(B, T_mel),
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
Returns:
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
dec_slf_attn_list = []
# -- Prepare masks
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
non_pad_mask = get_non_pad_mask(enc_pos)
# -- Forward
dec_output = enc_seq + self.position_enc(enc_pos)
for dec_layer in self.layer_stack:
dec_output, dec_slf_attn = dec_layer(
dec_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
dec_slf_attn_list += [dec_slf_attn]
return dec_output, dec_slf_attn_list
+from parakeet.models.fastspeech.utils import *
+from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
+from parakeet.models.fastspeech.encoder import Encoder
+from parakeet.models.fastspeech.decoder import Decoder
class FastSpeech(dg.Layer):
def __init__(self, cfg):

View File

@@ -1,166 +0,0 @@
import math
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Conv1D(dg.Layer):
"""
A convolution 1D block implemented with Conv2D. For simplicity, and to
ensure that the output has the same length as the input, it does not allow
stride > 1.
"""
def __init__(self,
in_channels,
num_filters,
filter_size=3,
padding=0,
dilation=1,
stride=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format='NCT',
dtype="float32"):
super(Conv1D, self).__init__(dtype=dtype)
self.padding = padding
self.in_channels = in_channels
self.num_filters = num_filters
self.filter_size = filter_size
self.stride = stride
self.dilation = dilation
self.padding = padding
self.act = act
self.data_format = data_format
self.conv = dg.Conv2D(
in_channels=in_channels,
num_filters=num_filters,
filter_size=(1, filter_size),
stride=(1, stride),
dilation=(1, dilation),
padding=(0, padding),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, T), the output, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.conv(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
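The unsqueeze/squeeze pattern above is what lets a Conv2D act on 1-D sequences; a quick sketch in the default NCT layout, with the Conv1D class above in scope (padding = filter_size // 2 keeps T unchanged for odd filter sizes):

import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    conv = Conv1D(in_channels=80, num_filters=256, filter_size=3, padding=1)
    x = dg.to_variable(np.random.randn(2, 80, 100).astype(np.float32))  # (B, C_in, T)
    y = conv(x)  # (B, 256, 100): channels mapped, length preserved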
class Pool1D(dg.Layer):
"""
A Pool 1D block implemented with Pool2D.
"""
def __init__(self,
pool_size=-1,
pool_type='max',
pool_stride=1,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
exclusive=True,
data_format='NCT',
dtype='float32'):
super(Pool1D, self).__init__(dtype=dtype)
self.pool_size = pool_size
self.pool_type = pool_type
self.pool_stride = pool_stride
self.pool_padding = pool_padding
self.global_pooling = global_pooling
self.use_cudnn = use_cudnn
self.ceil_mode = ceil_mode
self.exclusive = exclusive
self.data_format = data_format
self.dtype = dtype
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
global_pooling = global_pooling, use_cudnn = use_cudnn,
ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C, T), the input.
Returns:
x (Variable): Shape(B, C, T), the output, pooled along the T
dimension; the channel count is unchanged.
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.pool2d(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
class DynamicGRU(dg.Layer):
def __init__(self,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = dg.GRUUnit(
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res
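A usage sketch for DynamicGRU as the CBHG further down drives it: each timestep's input must already be projected to 3 * size features, because dg.GRUUnit(size * 3) fuses the update, reset, and candidate projections:

import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    size, batch, T = 128, 2, 10
    h_0 = dg.to_variable(np.zeros((batch, size), dtype=np.float32))
    gru = DynamicGRU(size=size, is_reverse=False, h_0=h_0)  # class above in scope
    x = dg.to_variable(np.random.randn(batch, T, size * 3).astype(np.float32))  # pre-projected gates
    y = gru(x)  # (batch, T, size)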

View File

@@ -1,218 +0,0 @@
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv, Pool1D, Linear
from parakeet.modules.dynamicGRU import DynamicGRU
import numpy as np
class EncoderPrenet(dg.Layer):
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
super(EncoderPrenet, self).__init__()
self.embedding_size = embedding_size
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
padding_idx = None)
self.conv_list = []
self.conv_list.append(Conv(in_channels = embedding_size,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT"))
for _ in range(2):
self.conv_list.append(Conv(in_channels = num_hidden,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
data_layout='NCHW') for _ in range(3)]
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
self.projection = Linear(num_hidden, num_hidden)
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embedding_size)
x = layers.transpose(x,[0,2,1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C)
x = self.projection(x)
return x
class CBHG(dg.Layer):
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
max_pool_kernel_size=2, is_post=False):
super(CBHG, self).__init__()
"""
:param hidden_size: dimension of hidden unit
:param batch_size: batch size
:param K: # of convolution banks
:param projection_size: dimension of projection unit
:param num_gru_layers: # of layers of GRUcell
:param max_pool_kernel_size: max pooling kernel size
:param is_post: whether post processing or not
"""
self.hidden_size = hidden_size
self.projection_size = projection_size
self.conv_list = []
self.conv_list.append(Conv(in_channels = projection_size,
out_channels = hidden_size,
filter_size = 1,
padding = int(np.floor(1/2)),
data_format = "NCT"))
for i in range(2,K+1):
self.conv_list.append(Conv(in_channels = hidden_size,
out_channels = hidden_size,
filter_size = i,
padding = int(np.floor(i/2)),
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batchnorm_list = []
for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list):
self.add_sublayer("batchnorm_list_{}".format(i), layer)
conv_outdim = hidden_size * K
self.conv_projection_1 = Conv(in_channels = conv_outdim,
out_channels = hidden_size,
filter_size = 3,
padding = int(np.floor(3/2)),
data_format = "NCT")
self.conv_projection_2 = Conv(in_channels = hidden_size,
out_channels = projection_size,
filter_size = 3,
padding = int(np.floor(3/2)),
data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format = "NCT")
self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0)
self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
def _conv_fit_dim(self, x, filter_size=3):
if filter_size % 2 == 0:
return x[:,:,:-1]
else:
return x
def forward(self, input_):
# input_.shape = [N, C, T]
conv_list = []
conv_input = input_
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i+1)
conv_input = layers.relu(batchnorm(conv_input))
conv_list.append(conv_input)
conv_cat = layers.concat(conv_list, axis=1)
conv_pool = self.max_pool(conv_cat)[:,:,:-1]
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
# conv_proj.shape = [N, C, T]
highway = layers.transpose(conv_proj, [0,2,1])
highway = self.highway(highway)
# highway.shape = [N, T, C]
fc_forward = self.fc_forward1(highway)
fc_reverse = self.fc_reverse1(highway)
out_forward = self.gru_forward1(fc_forward)
out_reverse = self.gru_reverse1(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
fc_forward = self.fc_forward2(out)
fc_reverse = self.fc_reverse2(out)
out_forward = self.gru_forward2(fc_forward)
out_reverse = self.gru_reverse2(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
out = layers.transpose(out, [0,2,1])
return out
class Highwaynet(dg.Layer):
def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__()
self.num_units = num_units
self.num_layers = num_layers
self.gates = []
self.linears = []
for i in range(num_layers):
self.linears.append(Linear(num_units, num_units))
self.gates.append(Linear(num_units, num_units))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.add_sublayer("linears_{}".format(i), linear)
self.add_sublayer("gates_{}".format(i), gate)
def forward(self, input_):
out = input_
for linear, gate in zip(self.linears, self.gates):
h = fluid.layers.relu(linear(out))
t_ = fluid.layers.sigmoid(gate(out))
c = 1 - t_
out = h * t_ + out * c
return out

View File

@@ -1,203 +0,0 @@
from parakeet.models.transformerTTS.module import *
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D, Linear
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
from parakeet.modules.prenet import PreNet
from parakeet.modules.post_convnet import PostConvNet
class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config, num_head=4):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=config.use_gpu)
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
def forward(self, x, positional):
if fluid.framework._dygraph_tracer()._train_mode:
query_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask(positional, x)
else:
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
# Positional dropout
x = layers.dropout(x, 0.1)
# Self attention encoder
attentions = list()
for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
x = ffn(x)
attentions.append(attention)
return x, query_mask, attentions
class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4):
super(Decoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.linear = Linear(num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = Linear(num_hidden, 1)
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config.audio.outputs_per_step,
use_cudnn = config.use_gpu)
def forward(self, key, value, query, c_mask, positional):
# get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
else:
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network
query = self.decoder_prenet(query)
# Centered position
query = self.linear(query)
# Get position embedding
positional = self.pos_emb(positional)
query = positional * self.alpha + query
#positional dropout
query = fluid.layers.dropout(query, 0.1)
# Attention decoder-decoder, encoder-decoder
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
# Mel linear projection
mel_out = self.mel_linear(query)
# Post Mel Network
out = self.postconvnet(mel_out)
out = mel_out + out
# Stop tokens
stop_tokens = self.stop_linear(query)
stop_tokens = layers.squeeze(stop_tokens, [-1])
stop_tokens = layers.sigmoid(stop_tokens)
return mel_out, out, attn_list, stop_tokens, selfattn_list
class TransformerTTS(dg.Layer):
def __init__(self, config):
super(TransformerTTS, self).__init__()
self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
self.decoder = Decoder(config.hidden_size, config)
self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel):
# key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
key, c_mask, attns_enc = self.encoder(characters, pos_text)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
# attn_probs (128, mel_len, seq_len)
# stop_preds (batch_size, mel_len, 1)
# attns_dec (128, mel_len, mel_len)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
class ModelPostNet(dg.Layer):
"""
CBHG Network (mel -> linear)
"""
def __init__(self, config):
super(ModelPostNet, self).__init__()
self.pre_proj = Conv1D(in_channels = config.audio.num_mels,
out_channels = config.hidden_size,
filter_size=1,
data_format = "NCT")
self.cbhg = CBHG(config.hidden_size, config.batch_size)
self.post_proj = Conv1D(in_channels = config.hidden_size,
out_channels = (config.audio.n_fft // 2) + 1,
filter_size=1,
data_format = "NCT")
def forward(self, mel):
mel = layers.transpose(mel, [0,2,1])
mel = self.pre_proj(mel)
mel = self.cbhg(mel)
mag_pred = self.post_proj(mel)
mag_pred = layers.transpose(mag_pred, [0,2,1])
return mag_pred
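A closing sketch of the mel-to-waveform path this postnet serves: it predicts linear magnitude, which Griffin-Lim (librosa.griffinlim) can invert to audio. Denormalization of the repo's log-scaled spectra is omitted for brevity, and cfg is a parsed training config as above:

import numpy as np
import librosa
import paddle.fluid.dygraph as dg

with dg.guard():
    postnet = ModelPostNet(cfg)  # renamed Vocoder elsewhere in this commit
    mel = dg.to_variable(np.zeros([1, 200, cfg.audio.num_mels], dtype=np.float32))
    mag = postnet(mel).numpy()[0]  # (T, n_fft // 2 + 1)
wav = librosa.griffinlim(mag.T, hop_length=cfg.audio.hop_length, win_length=cfg.audio.win_length)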