diff --git a/examples/FastSpeech/README.md b/examples/FastSpeech/README.md new file mode 100644 index 0000000..7e663e3 --- /dev/null +++ b/examples/FastSpeech/README.md @@ -0,0 +1,4 @@ +# FastSpeech +Paddle fluid implementation of FastSpeech, a feed-forward network based on the Transformer. The implementation is based on [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263). + +We implement the FastSpeech model in Paddle fluid with dynamic graph, which makes it convenient to build flexible network architectures. \ No newline at end of file diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/examples/FastSpeech/config/fastspeech.yaml similarity index 84% rename from parakeet/models/fastspeech/config/fastspeech.yaml rename to examples/FastSpeech/config/fastspeech.yaml index 90f520f..9a8a135 100644 --- a/parakeet/models/fastspeech/config/fastspeech.yaml +++ b/examples/FastSpeech/config/fastspeech.yaml @@ -35,10 +35,10 @@ epochs: 10000 lr: 0.001 save_step: 500 use_gpu: True -use_data_parallel: False +use_data_parallel: True -data_path: ../../../dataset/LJSpeech-1.1 -transtts_path: ../transformerTTS/checkpoint/ +data_path: ../../dataset/LJSpeech-1.1 +transtts_path: ../TransformerTTS/checkpoint/ transformer_step: 200000 save_path: ./checkpoint log_dir: ./log diff --git a/parakeet/models/fastspeech/config/synthesis.yaml b/examples/FastSpeech/config/synthesis.yaml similarity index 100% rename from parakeet/models/fastspeech/config/synthesis.yaml rename to examples/FastSpeech/config/synthesis.yaml diff --git a/parakeet/models/fastspeech/parse.py b/examples/FastSpeech/parse.py similarity index 100% rename from parakeet/models/fastspeech/parse.py rename to examples/FastSpeech/parse.py diff --git a/parakeet/models/fastspeech/synthesis.py b/examples/FastSpeech/synthesis.py similarity index 100% rename from parakeet/models/fastspeech/synthesis.py rename to examples/FastSpeech/synthesis.py diff --git a/parakeet/models/fastspeech/train.py b/examples/FastSpeech/train.py similarity index 96% rename from parakeet/models/fastspeech/train.py rename to examples/FastSpeech/train.py index 2caf1e9..45102e1 100644 --- a/parakeet/models/fastspeech/train.py +++ b/examples/FastSpeech/train.py @@ -14,9 +14,9 @@ import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers import paddle.fluid as fluid from parakeet.models.dataloader.ljspeech import LJSpeechLoader -from parakeet.models.transformerTTS.network import TransformerTTS -from network import FastSpeech -from utils import get_alignment +from parakeet.models.transformerTTS.transformerTTS import TransformerTTS +from parakeet.models.fastspeech.fastspeech import FastSpeech +from parakeet.models.fastspeech.utils import get_alignment def load_checkpoint(step, model_path): model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) diff --git a/examples/TransformerTTS/README.md b/examples/TransformerTTS/README.md new file mode 100644 index 0000000..cb3e15f --- /dev/null +++ b/examples/TransformerTTS/README.md @@ -0,0 +1,4 @@ +# TransformerTTS +Paddle fluid implementation of TransformerTTS, a neural TTS model based on the Transformer. The implementation is based on [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895). + +We implement the TransformerTTS model in Paddle fluid with dynamic graph, which makes it convenient to build flexible network architectures. 
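Both example configs in this change flip `use_data_parallel` to True. For reference, a minimal sketch of the Paddle fluid 1.x dygraph data-parallel pattern this flag typically enables; `train_context`, `wrap_model`, and `backward_step` are hypothetical helpers (not functions from this repo), and the script is assumed to be started with something like `python -m paddle.distributed.launch --selected_gpus=0,1 train.py`.

```python
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

def train_context(use_data_parallel):
    # Each process launched by paddle.distributed.launch owns one GPU,
    # identified by Env().dev_id.
    dev_id = dg.parallel.Env().dev_id if use_data_parallel else 0
    return dg.guard(fluid.CUDAPlace(dev_id))

def wrap_model(model, use_data_parallel):
    # Wrap a dg.Layer so that gradients are synchronized across devices.
    if use_data_parallel:
        strategy = dg.parallel.prepare_context()
        model = dg.parallel.DataParallel(model, strategy)
    return model

def backward_step(model, loss, use_data_parallel):
    # DataParallel requires scaling the loss by 1/nranks and all-reducing
    # the gradients before the optimizer update.
    if use_data_parallel:
        loss = model.scale_loss(loss)
        loss.backward()
        model.apply_collective_grads()
    else:
        loss.backward()
```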
\ No newline at end of file diff --git a/parakeet/models/transformerTTS/config/synthesis.yaml b/examples/TransformerTTS/config/synthesis.yaml similarity index 100% rename from parakeet/models/transformerTTS/config/synthesis.yaml rename to examples/TransformerTTS/config/synthesis.yaml diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/examples/TransformerTTS/config/train_transformer.yaml similarity index 87% rename from parakeet/models/transformerTTS/config/train_transformer.yaml rename to examples/TransformerTTS/config/train_transformer.yaml index 97b5f0f..fb94a41 100644 --- a/parakeet/models/transformerTTS/config/train_transformer.yaml +++ b/examples/TransformerTTS/config/train_transformer.yaml @@ -23,10 +23,10 @@ lr: 0.001 save_step: 1000 image_step: 2000 use_gpu: True -use_data_parallel: False +use_data_parallel: True stop_token: False -data_path: ../../../dataset/LJSpeech-1.1 +data_path: ../../dataset/LJSpeech-1.1 save_path: ./checkpoint log_dir: ./log #checkpoint_path: ./checkpoint diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/examples/TransformerTTS/config/train_vocoder.yaml similarity index 81% rename from parakeet/models/transformerTTS/config/train_postnet.yaml rename to examples/TransformerTTS/config/train_vocoder.yaml index c9bd487..3c37d4a 100644 --- a/parakeet/models/transformerTTS/config/train_postnet.yaml +++ b/examples/TransformerTTS/config/train_vocoder.yaml @@ -20,9 +20,9 @@ epochs: 10000 lr: 0.001 save_step: 10 use_gpu: True -use_data_parallel: False +use_data_parallel: True -data_path: ../../../dataset/LJSpeech-1.1 +data_path: ../../dataset/LJSpeech-1.1 save_path: ./checkpoint log_dir: ./log #checkpoint_path: ./checkpoint diff --git a/parakeet/models/transformerTTS/parse.py b/examples/TransformerTTS/parse.py similarity index 100% rename from parakeet/models/transformerTTS/parse.py rename to examples/TransformerTTS/parse.py diff --git a/parakeet/models/transformerTTS/synthesis.py b/examples/TransformerTTS/synthesis.py similarity index 100% rename from parakeet/models/transformerTTS/synthesis.py rename to examples/TransformerTTS/synthesis.py diff --git a/parakeet/models/transformerTTS/train_transformer.py b/examples/TransformerTTS/train_transformer.py similarity index 94% rename from parakeet/models/transformerTTS/train_transformer.py rename to examples/TransformerTTS/train_transformer.py index f8e85b0..4d046cb 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/examples/TransformerTTS/train_transformer.py @@ -7,11 +7,13 @@ import jsonargparse from parse import add_config_options_to_parser from pprint import pprint from matplotlib import cm +import numpy as np +import paddle.fluid as fluid import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers from parakeet.modules.utils import cross_entropy from parakeet.models.dataloader.ljspeech import LJSpeechLoader -from network import * +from parakeet.models.transformerTTS.transformerTTS import TransformerTTS def load_checkpoint(step, model_path): model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) @@ -86,10 +88,12 @@ def main(cfg): if local_rank==0: writer.add_scalars('training_loss', { 'mel_loss':mel_loss.numpy(), - 'post_mel_loss':post_mel_loss.numpy(), - 'stop_loss':stop_loss.numpy() + 'post_mel_loss':post_mel_loss.numpy() }, global_step) + if cfg.stop_token: + writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) + writer.add_scalars('alphas', { 'encoder_alpha':model.encoder.alpha.numpy(), 
'decoder_alpha':model.decoder.alpha.numpy(), diff --git a/parakeet/models/transformerTTS/train_postnet.py b/examples/TransformerTTS/train_vocoder.py similarity index 91% rename from parakeet/models/transformerTTS/train_postnet.py rename to examples/TransformerTTS/train_vocoder.py index 24d80d1..b73f28c 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/examples/TransformerTTS/train_vocoder.py @@ -6,11 +6,14 @@ from collections import OrderedDict import jsonargparse from parse import add_config_options_to_parser from pprint import pprint +import paddle.fluid as fluid +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers from parakeet.models.dataloader.ljspeech import LJSpeechLoader -from network import * +from parakeet.models.transformerTTS.vocoder import Vocoder def load_checkpoint(step, model_path): - model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() for param in model_dict: if param.startswith('_layers.'): @@ -40,7 +43,7 @@ def main(cfg): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - model = ModelPostNet(cfg) + model = Vocoder(cfg) model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), @@ -99,5 +102,5 @@ def main(cfg): if __name__ == '__main__': parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split()) + cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split()) main(cfg) \ No newline at end of file diff --git a/parakeet/__init__.py b/parakeet/__init__.py index 328cdce..9dbb99b 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -1,3 +1,3 @@ __version__ = "0.0.0" -from . import data, g2p, models, modules, utils +from . 
import data, g2p, models, modules diff --git a/parakeet/models/__init__.py b/parakeet/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parakeet/models/dataloader/ljspeech.py b/parakeet/models/dataloader/ljspeech.py new file mode 100644 index 0000000..21f8fc9 --- /dev/null +++ b/parakeet/models/dataloader/ljspeech.py @@ -0,0 +1,152 @@ +from pathlib import Path +import numpy as np +import pandas as pd +import librosa +import csv + +from paddle import fluid +from parakeet import g2p +from parakeet import audio +from parakeet.data.sampler import * +from parakeet.data.datacargo import DataCargo +from parakeet.data.batch import TextIDBatcher, SpecBatcher +from parakeet.data.dataset import DatasetMixin, TransformDataset + +class LJSpeechLoader: + def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True): + place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + + LJSPEECH_ROOT = Path(config.data_path) + metadata = LJSpeechMetaData(LJSPEECH_ROOT) + transformer = LJSpeech(config) + dataset = TransformDataset(metadata, transformer) + sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle) + + assert config.batch_size % nranks == 0 + each_bs = config.batch_size // nranks + if is_vocoder: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True) + else: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True) + + self.reader = fluid.io.DataLoader.from_generator( + capacity=32, + iterable=True, + use_double_buffer=True, + return_list=True) + self.reader.set_batch_generator(dataloader, place) + + +class LJSpeechMetaData(DatasetMixin): + def __init__(self, root): + self.root = Path(root) + self._wav_dir = self.root.joinpath("wavs") + csv_path = self.root.joinpath("metadata.csv") + self._table = pd.read_csv( + csv_path, + sep="|", + header=None, + quoting=csv.QUOTE_NONE, + names=["fname", "raw_text", "normalized_text"]) + + def get_example(self, i): + fname, raw_text, normalized_text = self._table.iloc[i] + fname = str(self._wav_dir.joinpath(fname + ".wav")) + return fname, raw_text, normalized_text + + def __len__(self): + return len(self._table) + + +class LJSpeech(object): + def __init__(self, config): + super(LJSpeech, self).__init__() + self.config = config + self._ljspeech_processor = audio.AudioProcessor( + sample_rate=config.audio.sr, + num_mels=config.audio.num_mels, + min_level_db=config.audio.min_level_db, + ref_level_db=config.audio.ref_level_db, + n_fft=config.audio.n_fft, + win_length=config.audio.win_length, + hop_length=config.audio.hop_length, + power=config.audio.power, + preemphasis=config.audio.preemphasis, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + + def __call__(self, metadatum): + """Generate an Example from a metadatum. If you want a + different preprocessing pipeline, you can override this method. + This method may require several processors, each of which has many options. + In that case, it is better to compose the transforms and pass the + composed transform to the init method. 
+ """ + fname, raw_text, normalized_text = metadatum + + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize + wav = self._ljspeech_processor.load_wav(str(fname)) + mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32) + mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32) + phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + + +def batch_examples(batch): + texts = [] + mels = [] + mel_inputs = [] + mel_lens = [] + text_lens = [] + pos_texts = [] + pos_mels = [] + for data in batch: + _, mel, text = data + mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + mel_lens.append(mel.shape[1]) + text_lens.append(len(text)) + pos_texts.append(np.arange(1, len(text) + 1)) + pos_mels.append(np.arange(1, mel.shape[1] + 1)) + mels.append(mel) + texts.append(text) + + # Sort by text_len in descending order + texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] + mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] + mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] + mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)] + pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] + pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + text_lens = sorted(text_lens, reverse=True) + + # Pad sequence with largest len of the batch + texts = TextIDBatcher(pad_id=0)(texts) #(B, T) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels) + mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens)) + +def batch_examples_vocoder(batch): + mels=[] + mags=[] + for data in batch: + mag, mel, _ = data + mels.append(mel) + mags.append(mag) + + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + + return (mels, mags) + + + + diff --git a/parakeet/models/fastspeech/FFTBlock.py b/parakeet/models/fastspeech/FFTBlock.py new file mode 100644 index 0000000..8600f8c --- /dev/null +++ b/parakeet/models/fastspeech/FFTBlock.py @@ -0,0 +1,36 @@ +import numpy as np +import math +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +import paddle.fluid as fluid +from parakeet.modules.multihead_attention import MultiheadAttention +from parakeet.modules.feed_forward import PositionwiseFeedForward + +class FFTBlock(dg.Layer): + def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): + super(FFTBlock, self).__init__() + self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False) + self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) + + def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): + """ + Feed Forward Transformer block in FastSpeech. + + Args: + enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input. 
+ T means the timesteps of input. + non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence. + slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention. + len_q means the sequence length of query, len_k means the sequence length of key. + + Returns: + output (Variable), Shape(B, T, C), the output after self-attention & ffn. + slf_attn (Variable), Shape(B * n_head, T, T), the self attention. + """ + output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) + output *= non_pad_mask + + output = self.pos_ffn(output) + output *= non_pad_mask + + return output, slf_attn \ No newline at end of file diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/LengthRegulator.py similarity index 75% rename from parakeet/models/fastspeech/modules.py rename to parakeet/models/fastspeech/LengthRegulator.py index 68d4776..2c72594 100644 --- a/parakeet/models/fastspeech/modules.py +++ b/parakeet/models/fastspeech/LengthRegulator.py @@ -1,42 +1,10 @@ import numpy as np import math -import utils +import parakeet.models.fastspeech.utils import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers import paddle.fluid as fluid from parakeet.modules.layers import Conv, Linear -from parakeet.modules.multihead_attention import MultiheadAttention -from parakeet.modules.feed_forward import PositionwiseFeedForward - -class FFTBlock(dg.Layer): - def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): - super(FFTBlock, self).__init__() - self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False) - self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) - - def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): - """ - Feed Forward Transformer block in FastSpeech. - - Args: - enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input. - T means the timesteps of input. - non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence. - slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention. - len_q means the sequence length of query, len_k means the sequence length of key. - - Returns: - output (Variable), Shape(B, T, C), the output after self-attention & ffn. - slf_attn (Variable), Shape(B * n_head, T, T), the self attention. 
- """ - output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) - output *= non_pad_mask - - output = self.pos_ffn(output) - output *= non_pad_mask - - return output, slf_attn - class LengthRegulator(dg.Layer): def __init__(self, input_size, out_channels, filter_size, dropout=0.1): diff --git a/parakeet/models/fastspeech/decoder.py b/parakeet/models/fastspeech/decoder.py new file mode 100644 index 0000000..273b88d --- /dev/null +++ b/parakeet/models/fastspeech/decoder.py @@ -0,0 +1,63 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +from parakeet.g2p.text.symbols import symbols +from parakeet.modules.utils import * +from parakeet.modules.post_convnet import PostConvNet +from parakeet.modules.layers import Linear +from parakeet.models.fastspeech.FFTBlock import FFTBlock + +class Decoder(dg.Layer): + def __init__(self, + len_max_seq, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Decoder, self).__init__() + + n_position = len_max_seq + 1 + self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_model], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), layer) + + def forward(self, enc_seq, enc_pos): + """ + Decoder layer of FastSpeech. + + Args: + enc_seq (Variable), Shape(B, text_T, C), dtype: float32. + The output of length regulator. + enc_pos (Variable, optional): Shape(B, T_mel), + dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum. + Returns: + dec_output (Variable), Shape(B, mel_T, C), the decoder output. + dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. 
+ """ + dec_slf_attn_list = [] + + # -- Prepare masks + slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) + non_pad_mask = get_non_pad_mask(enc_pos) + + # -- Forward + dec_output = enc_seq + self.position_enc(enc_pos) + + for dec_layer in self.layer_stack: + dec_output, dec_slf_attn = dec_layer( + dec_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + dec_slf_attn_list += [dec_slf_attn] + + return dec_output, dec_slf_attn_list \ No newline at end of file diff --git a/parakeet/models/fastspeech/encoder.py b/parakeet/models/fastspeech/encoder.py new file mode 100644 index 0000000..21d502f --- /dev/null +++ b/parakeet/models/fastspeech/encoder.py @@ -0,0 +1,67 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +from parakeet.g2p.text.symbols import symbols +from parakeet.modules.utils import * +from parakeet.modules.post_convnet import PostConvNet +from parakeet.modules.layers import Linear +from parakeet.models.fastspeech.FFTBlock import FFTBlock + +class Encoder(dg.Layer): + def __init__(self, + n_src_vocab, + len_max_seq, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Encoder, self).__init__() + n_position = len_max_seq + 1 + + self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_model], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), layer) + + def forward(self, character, text_pos): + """ + Encoder layer of FastSpeech. + + Args: + character (Variable): Shape(B, T_text), dtype: float32. The input text + characters. T_text means the timesteps of input characters. + text_pos (Variable): Shape(B, T_text), dtype: int64. The input text + position. T_text means the timesteps of input characters. + + Returns: + enc_output (Variable), Shape(B, text_T, C), the encoder output. + non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad. + enc_slf_attn_list (list), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list. 
+ """ + enc_slf_attn_list = [] + # -- prepare masks + # shape character (N, T) + slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character) + non_pad_mask = get_non_pad_mask(character) + + # -- Forward + enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) + + for enc_layer in self.layer_stack: + enc_output, enc_slf_attn = enc_layer( + enc_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + enc_slf_attn_list += [enc_slf_attn] + + return enc_output, non_pad_mask, enc_slf_attn_list \ No newline at end of file diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/fastspeech.py similarity index 50% rename from parakeet/models/fastspeech/network.py rename to parakeet/models/fastspeech/fastspeech.py index f1a1e91..da7fc31 100644 --- a/parakeet/models/fastspeech/network.py +++ b/parakeet/models/fastspeech/fastspeech.py @@ -4,124 +4,10 @@ from parakeet.g2p.text.symbols import symbols from parakeet.modules.utils import * from parakeet.modules.post_convnet import PostConvNet from parakeet.modules.layers import Linear -from utils import * -from modules import FFTBlock, LengthRegulator - -class Encoder(dg.Layer): - def __init__(self, - n_src_vocab, - len_max_seq, - n_layers, - n_head, - d_k, - d_v, - d_model, - d_inner, - fft_conv1d_kernel, - fft_conv1d_padding, - dropout=0.1): - super(Encoder, self).__init__() - n_position = len_max_seq + 1 - - self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0) - self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_model], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] - for i, layer in enumerate(self.layer_stack): - self.add_sublayer('fft_{}'.format(i), layer) - - def forward(self, character, text_pos): - """ - Encoder layer of FastSpeech. - - Args: - character (Variable): Shape(B, T_text), dtype: float32. The input text - characters. T_text means the timesteps of input characters. - text_pos (Variable): Shape(B, T_text), dtype: int64. The input text - position. T_text means the timesteps of input characters. - - Returns: - enc_output (Variable), Shape(B, text_T, C), the encoder output. - non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad. - enc_slf_attn_list (list), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list. 
- """ - enc_slf_attn_list = [] - # -- prepare masks - # shape character (N, T) - slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character) - non_pad_mask = get_non_pad_mask(character) - - # -- Forward - enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) - - for enc_layer in self.layer_stack: - enc_output, enc_slf_attn = enc_layer( - enc_output, - non_pad_mask=non_pad_mask, - slf_attn_mask=slf_attn_mask) - enc_slf_attn_list += [enc_slf_attn] - - return enc_output, non_pad_mask, enc_slf_attn_list - -class Decoder(dg.Layer): - def __init__(self, - len_max_seq, - n_layers, - n_head, - d_k, - d_v, - d_model, - d_inner, - fft_conv1d_kernel, - fft_conv1d_padding, - dropout=0.1): - super(Decoder, self).__init__() - - n_position = len_max_seq + 1 - self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_model], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] - for i, layer in enumerate(self.layer_stack): - self.add_sublayer('fft_{}'.format(i), layer) - - def forward(self, enc_seq, enc_pos): - """ - Decoder layer of FastSpeech. - - Args: - enc_seq (Variable), Shape(B, text_T, C), dtype: float32. - The output of length regulator. - enc_pos (Variable, optional): Shape(B, T_mel), - dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum. - Returns: - dec_output (Variable), Shape(B, mel_T, C), the decoder output. - dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. - """ - dec_slf_attn_list = [] - - # -- Prepare masks - slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) - non_pad_mask = get_non_pad_mask(enc_pos) - - # -- Forward - dec_output = enc_seq + self.position_enc(enc_pos) - - for dec_layer in self.layer_stack: - dec_output, dec_slf_attn = dec_layer( - dec_output, - non_pad_mask=non_pad_mask, - slf_attn_mask=slf_attn_mask) - dec_slf_attn_list += [dec_slf_attn] - - return dec_output, dec_slf_attn_list +from parakeet.models.fastspeech.utils import * +from parakeet.models.fastspeech.LengthRegulator import LengthRegulator +from parakeet.models.fastspeech.encoder import Encoder +from parakeet.models.fastspeech.decoder import Decoder class FastSpeech(dg.Layer): def __init__(self, cfg): diff --git a/parakeet/models/transformerTTS/layers.py b/parakeet/models/transformerTTS/layers.py deleted file mode 100644 index 7a8e97e..0000000 --- a/parakeet/models/transformerTTS/layers.py +++ /dev/null @@ -1,166 +0,0 @@ -import math -import numpy as np - -import paddle -from paddle import fluid -import paddle.fluid.dygraph as dg - - -class Conv1D(dg.Layer): - """ - A convolution 1D block implemented with Conv2D. Form simplicity and - ensuring the output has the same length as the input, it does not allow - stride > 1. 
- """ - - def __init__(self, - in_channels, - num_filters, - filter_size=3, - padding=0, - dilation=1, - stride=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - data_format='NCT', - dtype="float32"): - super(Conv1D, self).__init__(dtype=dtype) - - self.padding = padding - self.in_channels = in_channels - self.num_filters = num_filters - self.filter_size = filter_size - self.stride = stride - self.dilation = dilation - self.padding = padding - self.act = act - self.data_format = data_format - - self.conv = dg.Conv2D( - in_channels=in_channels, - num_filters=num_filters, - filter_size=(1, filter_size), - stride=(1, stride), - dilation=(1, dilation), - padding=(0, padding), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) - - def forward(self, x): - """ - Args: - x (Variable): Shape(B, C_in, 1, T), the input, where C_in means - input channels. - Returns: - x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means - output channels (num_filters). - """ - if self.data_format == 'NTC': - x = fluid.layers.transpose(x, [0, 2, 1]) - x = fluid.layers.unsqueeze(x, [2]) - x = self.conv(x) - x = fluid.layers.squeeze(x, [2]) - if self.data_format == 'NTC': - x = fluid.layers.transpose(x, [0, 2, 1]) - return x - -class Pool1D(dg.Layer): - """ - A Pool 1D block implemented with Pool2D. - """ - def __init__(self, - pool_size=-1, - pool_type='max', - pool_stride=1, - pool_padding=0, - global_pooling=False, - use_cudnn=True, - ceil_mode=False, - exclusive=True, - data_format='NCT', - dtype='float32'): - super(Pool1D, self).__init__(dtype=dtype) - self.pool_size = pool_size - self.pool_type = pool_type - self.pool_stride = pool_stride - self.pool_padding = pool_padding - self.global_pooling = global_pooling - self.use_cudnn = use_cudnn - self.ceil_mode = ceil_mode - self.exclusive = exclusive - self.data_format = data_format - self.dtype = dtype - - - self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type, - pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], - global_pooling = global_pooling, use_cudnn = use_cudnn, - ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype) - - - def forward(self, x): - """ - Args: - x (Variable): Shape(B, C_in, 1, T), the input, where C_in means - input channels. - Returns: - x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means - output channels (num_filters). 
- """ - if self.data_format == 'NTC': - x = fluid.layers.transpose(x, [0, 2, 1]) - x = fluid.layers.unsqueeze(x, [2]) - x = self.pool2d(x) - x = fluid.layers.squeeze(x, [2]) - if self.data_format == 'NTC': - x = fluid.layers.transpose(x, [0, 2, 1]) - return x - -class DynamicGRU(dg.Layer): - def __init__(self, - size, - param_attr=None, - bias_attr=None, - is_reverse=False, - gate_activation='sigmoid', - candidate_activation='tanh', - h_0=None, - origin_mode=False, - init_size=None): - super(DynamicGRU, self).__init__() - self.gru_unit = dg.GRUUnit( - size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - self.size = size - self.h_0 = h_0 - self.is_reverse = is_reverse - - def forward(self, inputs): - hidden = self.h_0 - res = [] - for i in range(inputs.shape[1]): - if self.is_reverse: - i = inputs.shape[1] - 1 - i - input_ = inputs[:, i:i + 1, :] - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) - hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) - res.append(hidden_) - if self.is_reverse: - res = res[::-1] - res = fluid.layers.concat(res, axis=1) - return res - diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py deleted file mode 100644 index e8eb8b6..0000000 --- a/parakeet/models/transformerTTS/module.py +++ /dev/null @@ -1,218 +0,0 @@ -import math -from parakeet.g2p.text.symbols import symbols -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -import paddle.fluid.layers as layers -from parakeet.modules.layers import Conv, Pool1D, Linear -from parakeet.modules.dynamicGRU import DynamicGRU -import numpy as np - - -class EncoderPrenet(dg.Layer): - def __init__(self, embedding_size, num_hidden, use_cudnn=True): - super(EncoderPrenet, self).__init__() - self.embedding_size = embedding_size - self.num_hidden = num_hidden - self.use_cudnn = use_cudnn - self.embedding = dg.Embedding( size = [len(symbols), embedding_size], - padding_idx = None) - self.conv_list = [] - self.conv_list.append(Conv(in_channels = embedding_size, - out_channels = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - use_cudnn = use_cudnn, - data_format = "NCT")) - for _ in range(2): - self.conv_list.append(Conv(in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - use_cudnn = use_cudnn, - data_format = "NCT")) - - for i, layer in enumerate(self.conv_list): - self.add_sublayer("conv_list_{}".format(i), layer) - - self.batch_norm_list = [dg.BatchNorm(num_hidden, - data_layout='NCHW') for _ in range(3)] - - for i, layer in enumerate(self.batch_norm_list): - self.add_sublayer("batch_norm_list_{}".format(i), layer) - - self.projection = Linear(num_hidden, num_hidden) - - def forward(self, x): - x = self.embedding(x) #(batch_size, seq_len, embending_size) - x = layers.transpose(x,[0,2,1]) - for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): - x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) - x = layers.transpose(x,[0,2,1]) #(N,T,C) - x = self.projection(x) - - return x - -class CBHG(dg.Layer): - def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, - max_pool_kernel_size=2, is_post=False): - super(CBHG, self).__init__() - """ - :param hidden_size: dimension of hidden unit - :param batch_size: batch size - :param K: # of 
convolution banks - :param projection_size: dimension of projection unit - :param num_gru_layers: # of layers of GRUcell - :param max_pool_kernel_size: max pooling kernel size - :param is_post: whether post processing or not - """ - self.hidden_size = hidden_size - self.projection_size = projection_size - self.conv_list = [] - self.conv_list.append(Conv(in_channels = projection_size, - out_channels = hidden_size, - filter_size = 1, - padding = int(np.floor(1/2)), - data_format = "NCT")) - for i in range(2,K+1): - self.conv_list.append(Conv(in_channels = hidden_size, - out_channels = hidden_size, - filter_size = i, - padding = int(np.floor(i/2)), - data_format = "NCT")) - - for i, layer in enumerate(self.conv_list): - self.add_sublayer("conv_list_{}".format(i), layer) - - self.batchnorm_list = [] - for i in range(K): - self.batchnorm_list.append(dg.BatchNorm(hidden_size, - data_layout='NCHW')) - - for i, layer in enumerate(self.batchnorm_list): - self.add_sublayer("batchnorm_list_{}".format(i), layer) - - conv_outdim = hidden_size * K - - self.conv_projection_1 = Conv(in_channels = conv_outdim, - out_channels = hidden_size, - filter_size = 3, - padding = int(np.floor(3/2)), - data_format = "NCT") - - self.conv_projection_2 = Conv(in_channels = hidden_size, - out_channels = projection_size, - filter_size = 3, - padding = int(np.floor(3/2)), - data_format = "NCT") - - self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, - data_layout='NCHW') - self.batchnorm_proj_2 = dg.BatchNorm(projection_size, - data_layout='NCHW') - self.max_pool = Pool1D(pool_size = max_pool_kernel_size, - pool_type='max', - pool_stride=1, - pool_padding=1, - data_format = "NCT") - self.highway = Highwaynet(self.projection_size) - - h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") - h_0 = dg.to_variable(h_0) - self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3) - self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3) - self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, - is_reverse = False, - origin_mode = True, - h_0 = h_0) - self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0 = h_0) - - self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3) - self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3) - self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, - is_reverse = False, - origin_mode = True, - h_0 = h_0) - self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0 = h_0) - - def _conv_fit_dim(self, x, filter_size=3): - if filter_size % 2 == 0: - return x[:,:,:-1] - else: - return x - - def forward(self, input_): - # input_.shape = [N, C, T] - - conv_list = [] - conv_input = input_ - - for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)): - conv_input = self._conv_fit_dim(conv(conv_input), i+1) - conv_input = layers.relu(batchnorm(conv_input)) - conv_list.append(conv_input) - - conv_cat = layers.concat(conv_list, axis=1) - conv_pool = self.max_pool(conv_cat)[:,:,:-1] - - - conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool)))) - conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ - - # conv_proj.shape = [N, C, T] - highway = layers.transpose(conv_proj, [0,2,1]) - highway = self.highway(highway) - - # highway.shape = [N, T, C] - fc_forward = self.fc_forward1(highway) - fc_reverse = self.fc_reverse1(highway) - out_forward = 
self.gru_forward1(fc_forward) - out_reverse = self.gru_reverse1(fc_reverse) - out = layers.concat([out_forward, out_reverse], axis=-1) - fc_forward = self.fc_forward2(out) - fc_reverse = self.fc_reverse2(out) - out_forward = self.gru_forward2(fc_forward) - out_reverse = self.gru_reverse2(fc_reverse) - out = layers.concat([out_forward, out_reverse], axis=-1) - out = layers.transpose(out, [0,2,1]) - return out - -class Highwaynet(dg.Layer): - def __init__(self, num_units, num_layers=4): - super(Highwaynet, self).__init__() - self.num_units = num_units - self.num_layers = num_layers - - self.gates = [] - self.linears = [] - - for i in range(num_layers): - self.linears.append(Linear(num_units, num_units)) - self.gates.append(Linear(num_units, num_units)) - - for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): - self.add_sublayer("linears_{}".format(i), linear) - self.add_sublayer("gates_{}".format(i), gate) - - def forward(self, input_): - out = input_ - - for linear, gate in zip(self.linears, self.gates): - h = fluid.layers.relu(linear(out)) - t_ = fluid.layers.sigmoid(gate(out)) - - c = 1 - t_ - out = h * t_ + out * c - - return out - - - - - - diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py deleted file mode 100644 index 5bbe115..0000000 --- a/parakeet/models/transformerTTS/network.py +++ /dev/null @@ -1,203 +0,0 @@ -from parakeet.models.transformerTTS.module import * -import paddle.fluid.dygraph as dg -import paddle.fluid as fluid -from parakeet.modules.layers import Conv1D, Linear -from parakeet.modules.utils import * -from parakeet.modules.multihead_attention import MultiheadAttention -from parakeet.modules.feed_forward import PositionwiseFeedForward -from parakeet.modules.prenet import PreNet -from parakeet.modules.post_convnet import PostConvNet - - -class Encoder(dg.Layer): - def __init__(self, embedding_size, num_hidden, config, num_head=4): - super(Encoder, self).__init__() - self.num_hidden = num_hidden - param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) - self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') - self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, - num_hidden = num_hidden, - use_cudnn=config.use_gpu) - self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] - for i, layer in enumerate(self.layers): - self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] - for i, layer in enumerate(self.ffns): - self.add_sublayer("ffns_{}".format(i), layer) - - def forward(self, x, positional): - if fluid.framework._dygraph_tracer()._train_mode: - query_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask(positional, x) - else: - query_mask, mask = None, None - - # Encoder pre_network - x = self.encoder_prenet(x) #(N,T,C) - - - # Get positional encoding - positional = self.pos_emb(positional) - - x = positional * self.alpha + x #(N, T, C) - - - # Positional dropout - x = layers.dropout(x, 0.1) - - # Self attention encoder - attentions = list() - for layer, ffn in zip(self.layers, self.ffns): - 
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) - x = ffn(x) - attentions.append(attention) - - return x, query_mask, attentions - -class Decoder(dg.Layer): - def __init__(self, num_hidden, config, num_head=4): - super(Decoder, self).__init__() - self.num_hidden = num_hidden - param = fluid.ParamAttr() - self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', - default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) - self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.decoder_prenet = PreNet(input_size = config.audio.num_mels, - hidden_size = num_hidden * 2, - output_size = num_hidden, - dropout_rate=0.2) - self.linear = Linear(num_hidden, num_hidden) - - self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] - for i, layer in enumerate(self.selfattn_layers): - self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] - for i, layer in enumerate(self.attn_layers): - self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] - for i, layer in enumerate(self.ffns): - self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) - self.stop_linear = Linear(num_hidden, 1) - - self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, - filter_size = 5, padding = 4, num_conv=5, - outputs_per_step=config.audio.outputs_per_step, - use_cudnn = config.use_gpu) - - def forward(self, key, value, query, c_mask, positional): - - # get decoder mask with triangular matrix - - if fluid.framework._dygraph_tracer()._train_mode: - m_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query) - triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) - mask = mask + triu_tensor - mask = fluid.layers.cast(mask == 0, np.float32) - - # (batch_size, decoder_len, encoder_len) - zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) - else: - mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) - m_mask, zero_mask = None, None - - # Decoder pre-network - query = self.decoder_prenet(query) - - # Centered position - query = self.linear(query) - - # Get position embedding - positional = self.pos_emb(positional) - query = positional * self.alpha + query - - #positional dropout - query = fluid.layers.dropout(query, 0.1) - - # Attention decoder-decoder, encoder-decoder - selfattn_list = list() - attn_list = list() - - for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): - query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) - query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) - query = ffn(query) - selfattn_list.append(attn_dec) - attn_list.append(attn_dot) - # Mel linear projection - mel_out = self.mel_linear(query) - # Post Mel Network - out = self.postconvnet(mel_out) - out = mel_out + out - - # Stop tokens - 
stop_tokens = self.stop_linear(query) - stop_tokens = layers.squeeze(stop_tokens, [-1]) - stop_tokens = layers.sigmoid(stop_tokens) - - return mel_out, out, attn_list, stop_tokens, selfattn_list - -class TransformerTTS(dg.Layer): - def __init__(self, config): - super(TransformerTTS, self).__init__() - self.encoder = Encoder(config.embedding_size, config.hidden_size, config) - self.decoder = Decoder(config.hidden_size, config) - self.config = config - - def forward(self, characters, mel_input, pos_text, pos_mel): - # key (batch_size, seq_len, channel) - # c_mask (batch_size, seq_len) - # attns_enc (channel / 2, seq_len, seq_len) - - key, c_mask, attns_enc = self.encoder(characters, pos_text) - - # mel_output/postnet_output (batch_size, mel_len, n_mel) - # attn_probs (128, mel_len, seq_len) - # stop_preds (batch_size, mel_len, 1) - # attns_dec (128, mel_len, mel_len) - mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel) - - return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec - -class ModelPostNet(dg.Layer): - """ - CBHG Network (mel -> linear) - """ - def __init__(self, config): - super(ModelPostNet, self).__init__() - self.pre_proj = Conv1D(in_channels = config.audio.num_mels, - out_channels = config.hidden_size, - filter_size=1, - data_format = "NCT") - self.cbhg = CBHG(config.hidden_size, config.batch_size) - self.post_proj = Conv1D(in_channels = config.hidden_size, - out_channels = (config.audio.n_fft // 2) + 1, - filter_size=1, - data_format = "NCT") - - def forward(self, mel): - mel = layers.transpose(mel, [0,2,1]) - mel = self.pre_proj(mel) - mel = self.cbhg(mel) - mag_pred = self.post_proj(mel) - mag_pred = layers.transpose(mag_pred, [0,2,1]) - return mag_pred - - - - - -
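The deleted ModelPostNet above survives as parakeet/models/transformerTTS/vocoder.py (see the Vocoder import in train_vocoder.py): a CBHG network that maps mel spectrograms to linear-scale magnitude spectrograms. Recovering a waveform from that magnitude is typically done with Griffin-Lim. A minimal sketch using librosa (already a dependency in ljspeech.py); the STFT parameters and `power` are assumptions and should come from the config's audio section:

```python
import numpy as np
import librosa

def griffin_lim(mag, n_fft=1024, hop_length=256, win_length=1024,
                n_iters=60, power=1.2):
    # mag: (1 + n_fft // 2, T) linear-scale magnitude predicted by the
    # vocoder, after denormalization. n_iters mirrors the
    # griffin_lim_iters=60 setting in ljspeech.py's AudioProcessor.
    mag = mag ** power  # sharpen the magnitude to reduce artifacts
    # start from random phase and iteratively make it consistent
    angles = np.exp(2j * np.pi * np.random.rand(*mag.shape))
    spec = mag.astype(np.complex64) * angles
    for _ in range(n_iters):
        wav = librosa.istft(spec, hop_length=hop_length, win_length=win_length)
        rebuilt = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length,
                               win_length=win_length)
        _, angles = librosa.magphase(rebuilt)
        spec = mag.astype(np.complex64) * angles
    return librosa.istft(spec, hop_length=hop_length, win_length=win_length)
```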