diff --git a/parakeet/models/dataloader/__init__.py b/parakeet/models/dataloader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parakeet/models/dataloader/jlspeech.py b/parakeet/models/dataloader/jlspeech.py new file mode 100644 index 0000000..7f39bfb --- /dev/null +++ b/parakeet/models/dataloader/jlspeech.py @@ -0,0 +1,148 @@ +from pathlib import Path +import numpy as np +import pandas as pd +import librosa + +from paddle import fluid +from parakeet import g2p +from parakeet import audio +from parakeet.data.sampler import * +from parakeet.data.datacargo import DataCargo +from parakeet.data.dataset import Dataset +from parakeet.data.batch import TextIDBatcher, SpecBatcher + +class LJSpeechLoader: + def __init__(self, config, nranks, rank, is_vocoder=False): + place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + + LJSPEECH_ROOT = Path(config.data_path) + dataset = LJSpeech(LJSPEECH_ROOT, config) + sampler = DistributedSampler(len(dataset), nranks, rank) + + assert config.batch_size % nranks == 0 + each_bs = config.batch_size // nranks + if is_vocoder: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True) + else: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) + + self.reader = fluid.io.DataLoader.from_generator( + capacity=32, + iterable=True, + use_double_buffer=True, + return_list=True) + self.reader.set_batch_generator(dataloader, place) + + +class LJSpeech(Dataset): + def __init__(self, root, config): + super(LJSpeech, self).__init__() + assert isinstance(root, (str, Path)), "root should be a string or Path object" + self.root = root if isinstance(root, Path) else Path(root) + self.metadata = self._prepare_metadata() + self.config = config + + def _prepare_metadata(self): + csv_path = self.root.joinpath("metadata.csv") + metadata = pd.read_csv(csv_path, sep="|", 
header=None, quoting=3, + names=["fname", "raw_text", "normalized_text"]) + return metadata + + def _get_example(self, metadatum): + """All the code for generating an Example from a metadatum. If you want a + different preprocessing pipeline, you can override this method. + This method may require several processor, each of which has a lot of options. + In this case, you'd better pass a composed transform and pass it to the init + method. + """ + + fname, raw_text, normalized_text = metadatum + wav_path = self.root.joinpath("wavs", fname + ".wav") + + _ljspeech_processor = audio.AudioProcessor( + sample_rate=22050, + num_mels=80, + min_level_db=-100, + ref_level_db=20, + n_fft=2048, + win_length= int(22050 * 0.05), + hop_length= int(22050 * 0.0125), + power=1.2, + preemphasis=0.97, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize + wav = _ljspeech_processor.load_wav(str(wav_path)) + mag = _ljspeech_processor.spectrogram(wav).astype(np.float32) + mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32) + phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + + def __getitem__(self, index): + metadatum = self.metadata.iloc[index] + example = self._get_example(metadatum) + return example + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __len__(self): + return len(self.metadata) + + +def batch_examples(batch): + texts = [] + mels = [] + mel_inputs = [] + text_lens = [] + pos_texts = [] + pos_mels = [] + for data in batch: + _, mel, text = data + mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + text_lens.append(len(text)) + pos_texts.append(np.arange(1, 
len(text) + 1)) + pos_mels.append(np.arange(1, mel.shape[1] + 1)) + mels.append(mel) + texts.append(text) + + # Sort by text_len in descending order + texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] + mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] + mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] + pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] + pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + text_lens = sorted(text_lens, reverse=True) + + # Pad sequence with largest len of the batch + texts = TextIDBatcher(pad_id=0)(texts) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) + +def batch_examples_vocoder(batch): + mels=[] + mags=[] + for data in batch: + mag, mel, _ = data + mels.append(mel) + mags.append(mag) + + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + + return (mels, mags) + + + + diff --git a/parakeet/models/fastspeech/__init__.py b/parakeet/models/fastspeech/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parakeet/models/fastspeech/config/fastapeech.yaml b/parakeet/models/fastspeech/config/fastapeech.yaml new file mode 100644 index 0000000..3e62846 --- /dev/null +++ b/parakeet/models/fastspeech/config/fastapeech.yaml @@ -0,0 +1,41 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +encoder_n_layer: 6 +encoder_head: 2 
+encoder_conv1d_filter_size: 1536 +max_sep_len: 2048 +encoder_output_size: 384 +word_vec_dim: 384 +decoder_n_layer: 6 +decoder_head: 2 +decoder_conv1d_filter_size: 1536 +decoder_output_size: 384 +d_model: 384 +duration_predictor_output_size: 256 +duration_predictor_filter_size: 3 +fft_conv1d_filter: 3 +fft_conv1d_padding: 1 + + +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +image_step: 2000 +use_gpu: False +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +transtts_path: ./checkpoint +transformer_step: 70000 +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/parakeet/models/fastspeech/config/fastspeech.yaml new file mode 100644 index 0000000..947457b --- /dev/null +++ b/parakeet/models/fastspeech/config/fastspeech.yaml @@ -0,0 +1,43 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +encoder_n_layer: 6 +encoder_head: 2 +encoder_conv1d_filter_size: 1536 +max_sep_len: 2048 +encoder_output_size: 384 +embedding_size: 384 +decoder_n_layer: 6 +decoder_head: 2 +decoder_conv1d_filter_size: 1536 +decoder_output_size: 384 +hidden_size: 384 +duration_predictor_output_size: 256 +duration_predictor_filter_size: 3 +fft_conv1d_filter: 3 +fft_conv1d_padding: 1 +dropout: 0.1 +transformer_head: 4 + +warm_up_step: 4000 +grad_clip_thresh: 0.1 +batch_size: 32 +epochs: 10000 +lr: 0.001 +save_step: 500 +use_gpu: True +use_data_parallel: False + +data_path: ../../../dataset/LJSpeech-1.1 +transtts_path: ../transformerTTS/checkpoint +transformer_step: 20 +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/dataset.py b/parakeet/models/fastspeech/dataset.py new file mode 100644 index 0000000..b3ee344 --- /dev/null +++ b/parakeet/models/fastspeech/dataset.py @@ -0,0 +1,124 @@ +import torch +from torch.nn import functional as F +from 
torch.utils.data import Dataset, DataLoader + +import numpy as np +import math +import os + +import hparams +import Audio +from text import text_to_sequence +from utils import process_text, pad_1D, pad_2D + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +class FastSpeechDataset(Dataset): + """ LJSpeech """ + + def __init__(self): + self.text = process_text(os.path.join("data", "train.txt")) + + def __len__(self): + return len(self.text) + + def __getitem__(self, idx): + mel_gt_name = os.path.join( + hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (idx+1)) + mel_gt_target = np.load(mel_gt_name) + D = np.load(os.path.join(hparams.alignment_path, str(idx)+".npy")) + + character = self.text[idx][0:len(self.text[idx])-1] + character = np.array(text_to_sequence( + character, hparams.text_cleaners)) + + sample = {"text": character, + "mel_target": mel_gt_target, + "D": D} + + return sample + + +def reprocess(batch, cut_list): + texts = [batch[ind]["text"] for ind in cut_list] + mel_targets = [batch[ind]["mel_target"] for ind in cut_list] + Ds = [batch[ind]["D"] for ind in cut_list] + + length_text = np.array([]) + for text in texts: + length_text = np.append(length_text, text.shape[0]) + + src_pos = list() + max_len = int(max(length_text)) + for length_src_row in length_text: + src_pos.append(np.pad([i+1 for i in range(int(length_src_row))], + (0, max_len-int(length_src_row)), 'constant')) + src_pos = np.array(src_pos) + + length_mel = np.array(list()) + for mel in mel_targets: + length_mel = np.append(length_mel, mel.shape[0]) + + mel_pos = list() + max_mel_len = int(max(length_mel)) + for length_mel_row in length_mel: + mel_pos.append(np.pad([i+1 for i in range(int(length_mel_row))], + (0, max_mel_len-int(length_mel_row)), 'constant')) + mel_pos = np.array(mel_pos) + + texts = pad_1D(texts) + Ds = pad_1D(Ds) + mel_targets = pad_2D(mel_targets) + + out = {"text": texts, + "mel_target": mel_targets, + "D": Ds, + "mel_pos": mel_pos, + 
"src_pos": src_pos, + "mel_max_len": max_mel_len} + + return out + + +def collate_fn(batch): + len_arr = np.array([d["text"].shape[0] for d in batch]) + index_arr = np.argsort(-len_arr) + batchsize = len(batch) + real_batchsize = int(math.sqrt(batchsize)) + + cut_list = list() + for i in range(real_batchsize): + cut_list.append(index_arr[i*real_batchsize:(i+1)*real_batchsize]) + + output = list() + for i in range(real_batchsize): + output.append(reprocess(batch, cut_list[i])) + + return output + + +if __name__ == "__main__": + # Test + dataset = FastSpeechDataset() + training_loader = DataLoader(dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_fn, + drop_last=True, + num_workers=0) + total_step = hparams.epochs * len(training_loader) * hparams.batch_size + + cnt = 0 + for i, batchs in enumerate(training_loader): + for j, data_of_batch in enumerate(batchs): + mel_target = torch.from_numpy( + data_of_batch["mel_target"]).float().to(device) + D = torch.from_numpy(data_of_batch["D"]).int().to(device) + # print(mel_target.size()) + # print(D.sum()) + print(cnt) + if mel_target.size(1) == D.sum().item(): + cnt += 1 + + print(cnt) diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py new file mode 100644 index 0000000..6c09f41 --- /dev/null +++ b/parakeet/models/fastspeech/modules.py @@ -0,0 +1,117 @@ +import numpy as np +import math +import utils +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +import paddle.fluid as fluid +from parakeet.modules.layers import Conv1D +from parakeet.modules.multihead_attention import MultiheadAttention +from parakeet.modules.feed_forward import PositionwiseFeedForward + + + +class FFTBlock(dg.Layer): + """FFT Block""" + def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): + super(FFTBlock, self).__init__() + self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, dropout=dropout) + self.pos_ffn = 
PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) + + def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): + enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) + enc_output *= non_pad_mask + + enc_output = self.pos_ffn(enc_output) + enc_output *= non_pad_mask + + return enc_output, enc_slf_attn + + +class LengthRegulator(dg.Layer): + def __init__(self, input_size, out_channels, filter_size, dropout=0.1): + super(LengthRegulator, self).__init__() + self.duration_predictor = DurationPredictor(input_size=input_size, + out_channels=out_channels, + filter_size=filter_size, + dropout=dropout) + + def LR(self, x, duration_predictor_output, alpha=1.0): + output = [] + batch_size = x.shape[0] + for i in range(batch_size): + output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha)) + output = self.pad(output) + return output + + def pad(self, input_ele): + max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))]) + out_list = [] + for i in range(len(input_ele)): + pad_len = max_len - input_ele[i].shape[0] + one_batch_padded = layers.pad( + input_ele[i], [0, pad_len, 0, 0], pad_value=0.0) + out_list.append(one_batch_padded) + out_padded = layers.stack(out_list) + return out_padded + + def expand(self, batch, predicted, alpha): + out = [] + time_steps = batch.shape[1] + fertilities = predicted.numpy() + batch = layers.squeeze(batch,[0]) + + + for i in range(time_steps): + if fertilities[0,i]==0: + continue + out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1])) + out = layers.concat(out, axis=0) + return out + + + def forward(self, x, alpha=1.0, target=None): + duration_predictor_output = self.duration_predictor(x) + if fluid.framework._dygraph_tracer()._train_mode: + output = self.LR(x, target) + return output, duration_predictor_output + else: + duration_predictor_output = layers.round(duration_predictor_output) + 
output = self.LR(x, duration_predictor_output, alpha) + mel_pos = dg.to_variable([i+1 for i in range(output.shape[1])]) + return output, mel_pos + +class DurationPredictor(dg.Layer): + """ Duration Predictor """ + def __init__(self, input_size, out_channels, filter_size, dropout=0.1): + super(DurationPredictor, self).__init__() + self.input_size = input_size + self.out_channels = out_channels + self.filter_size = filter_size + self.dropout = dropout + + self.conv1 = Conv1D(in_channels = self.input_size, + out_channels = self.out_channels, + filter_size = self.filter_size, + padding=1, + data_format='NTC') + self.conv2 = Conv1D(in_channels = self.out_channels, + out_channels = self.out_channels, + filter_size = self.filter_size, + padding=1, + data_format='NTC') + self.layer_norm1 = dg.LayerNorm(self.out_channels) + self.layer_norm2 = dg.LayerNorm(self.out_channels) + + self.linear =dg.Linear(self.out_channels, 1) + + def forward(self, encoder_output): + + # encoder_output.shape(N, T, C) + out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout) + out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout) + out = layers.relu(self.linear(out)) + out = layers.squeeze(out, axes=[-1]) + + return out + + diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py new file mode 100644 index 0000000..3f00263 --- /dev/null +++ b/parakeet/models/fastspeech/network.py @@ -0,0 +1,163 @@ +from utils import * +from modules import * +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +from parakeet.g2p.text.symbols import symbols +from parakeet.modules.utils import * +from parakeet.modules.post_convnet import PostConvNet + +class Encoder(dg.Layer): + def __init__(self, + n_src_vocab, + len_max_seq, + d_word_vec, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Encoder, self).__init__() + n_position = 
len_max_seq + 1 + + self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), layer) + + def forward(self, character, text_pos): + enc_slf_attn_list = [] + # -- prepare masks + # shape character (N, T) + slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character) + non_pad_mask = get_non_pad_mask(character) + + # -- Forward + enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) + + for enc_layer in self.layer_stack: + enc_output, enc_slf_attn = enc_layer( + enc_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + enc_slf_attn_list += [enc_slf_attn] + + return enc_output, non_pad_mask, enc_slf_attn_list + +class Decoder(dg.Layer): + def __init__(self, + len_max_seq, + d_word_vec, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Decoder, self).__init__() + + n_position = len_max_seq + 1 + self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), 
layer) + + def forward(self, enc_seq, enc_pos): + dec_slf_attn_list = [] + + # -- Prepare masks + slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) + non_pad_mask = get_non_pad_mask(enc_pos) + + # -- Forward + dec_output = enc_seq + self.position_enc(enc_pos) + + for dec_layer in self.layer_stack: + dec_output, dec_slf_attn = dec_layer( + dec_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + dec_slf_attn_list += [dec_slf_attn] + + return dec_output, dec_slf_attn_list + +class FastSpeech(dg.Layer): + def __init__(self, cfg): + " FastSpeech" + super(FastSpeech, self).__init__() + + self.encoder = Encoder(n_src_vocab=len(symbols)+1, + len_max_seq=cfg.max_sep_len, + d_word_vec=cfg.embedding_size, + n_layers=cfg.encoder_n_layer, + n_head=cfg.encoder_head, + d_k=64, + d_v=64, + d_model=cfg.hidden_size, + d_inner=cfg.encoder_conv1d_filter_size, + fft_conv1d_kernel=cfg.fft_conv1d_filter, + fft_conv1d_padding=cfg.fft_conv1d_padding, + dropout=0.1) + self.length_regulator = LengthRegulator(input_size=cfg.hidden_size, + out_channels=cfg.duration_predictor_output_size, + filter_size=cfg.duration_predictor_filter_size, + dropout=cfg.dropout) + self.decoder = Decoder(len_max_seq=cfg.max_sep_len, + d_word_vec=cfg.embedding_size, + n_layers=cfg.decoder_n_layer, + n_head=cfg.decoder_head, + d_k=64, + d_v=64, + d_model=cfg.hidden_size, + d_inner=cfg.decoder_conv1d_filter_size, + fft_conv1d_kernel=cfg.fft_conv1d_filter, + fft_conv1d_padding=cfg.fft_conv1d_padding, + dropout=0.1) + self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels) + self.postnet = PostConvNet(n_mels=80, + num_hidden=512, + filter_size=5, + padding=int(5 / 2), + num_conv=5, + outputs_per_step=1, + use_cudnn=True, + dropout=0.1) + + def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): + encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) + if fluid.framework._dygraph_tracer()._train_mode: + + 
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, + target=length_target, + alpha=alpha) + decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) + + mel_output = self.mel_linear(decoder_output) + mel_output_postnet = self.postnet(mel_output) + mel_output + + return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list + else: + length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) + decoder_output = self.decoder(length_regulator_output, decoder_pos) + + mel_output = self.mel_linear(decoder_output) + mel_output_postnet = self.postnet(mel_output) + mel_output + + return mel_output, mel_output_postnet \ No newline at end of file diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py new file mode 100644 index 0000000..a6a8b2f --- /dev/null +++ b/parakeet/models/fastspeech/parse.py @@ -0,0 +1,93 @@ +import jsonargparse + +def add_config_options_to_parser(parser): + parser.add_argument('--audio.num_mels', type=int, default=80, + help="the number of mel bands when calculating mel spectrograms.") + parser.add_argument('--audio.n_fft', type=int, default=2048, + help="the number of fft components.") + parser.add_argument('--audio.sr', type=int, default=22050, + help="the sampling rate of audio data file.") + parser.add_argument('--audio.preemphasis', type=float, default=0.97, + help="the preemphasis coefficient.") + parser.add_argument('--audio.hop_length', type=float, default=128, + help="the number of samples to advance between frames.") + parser.add_argument('--audio.win_length', type=float, default=1024, + help="the length (width) of the window function.") + parser.add_argument('--audio.power', type=float, default=1.4, + help="the power to raise before griffin-lim.") + parser.add_argument('--audio.min_level_db', type=int, default=-100, + help="the minimum level db.") + 
parser.add_argument('--audio.ref_level_db', type=int, default=20, + help="the reference level db.") + parser.add_argument('--audio.outputs_per_step', type=int, default=1, + help="the outputs per step.") + + parser.add_argument('--embedding_size', type=int, default=256, + help="the dim size of embedding.") + parser.add_argument('--encoder_n_layer', type=int, default=6, + help="the number of FFT Block in encoder.") + parser.add_argument('--encoder_head', type=int, default=2, + help="the attention head number in encoder.") + parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024, + help="the filter size of conv1d in encoder.") + parser.add_argument('--max_sep_len', type=int, default=2048, + help="the max length of sequence.") + parser.add_argument('--encoder_output_size', type=int, default=256, + help="the output channel size of encoder.") + parser.add_argument('--decoder_n_layer', type=int, default=6, + help="the number of FFT Block in decoder.") + parser.add_argument('--decoder_head', type=int, default=2, + help="the attention head number in decoder.") + parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024, + help="the filter size of conv1d in decoder.") + parser.add_argument('--decoder_output_size', type=int, default=256, + help="the output channel size of decoder.") + parser.add_argument('--hidden_size', type=int, default=256, + help="the hidden size in model.") + parser.add_argument('--duration_predictor_output_size', type=int, default=256, + help="the output size of duration predictior.") + parser.add_argument('--duration_predictor_filter_size', type=int, default=3, + help="the filter size of conv1d in duration prediction.") + parser.add_argument('--fft_conv1d_filter', type=int, default=3, + help="the filter size of conv1d in fft.") + parser.add_argument('--fft_conv1d_padding', type=int, default=1, + help="the padding size of conv1d in fft.") + parser.add_argument('--dropout', type=float, default=0.1, + help="the dropout 
in network.") + parser.add_argument('--transformer_head', type=int, default=4, + help="the attention head num of transformerTTS.") + + parser.add_argument('--warm_up_step', type=int, default=4000, + help="the warm up step of learning rate.") + parser.add_argument('--grad_clip_thresh', type=float, default=1.0, + help="the threshold of grad clip.") + parser.add_argument('--batch_size', type=int, default=32, + help="batch size for training.") + parser.add_argument('--epochs', type=int, default=10000, + help="the number of epoch for training.") + parser.add_argument('--lr', type=float, default=0.001, + help="the learning rate for training.") + parser.add_argument('--save_step', type=int, default=500, + help="checkpointing interval during training.") + parser.add_argument('--use_gpu', type=bool, default=True, + help="use gpu or not during training.") + parser.add_argument('--use_data_parallel', type=bool, default=False, + help="use data parallel or not during training.") + + parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + help="the path of dataset.") + parser.add_argument('--checkpoint_path', type=str, default=None, + help="the path to load checkpoint or pretrain model.") + parser.add_argument('--save_path', type=str, default='./checkpoint', + help="the path to save checkpoint.") + parser.add_argument('--log_dir', type=str, default='./log', + help="the directory to save tensorboard log.") + parser.add_argument('--sample_path', type=str, default='./sample', + help="the directory to save audio sample in synthesis.") + parser.add_argument('--transtts_path', type=str, default='./log', + help="the directory to load pretrain transformerTTS model.") + parser.add_argument('--transformer_step', type=int, default=70000, + help="the step to load transformerTTS model.") + + + parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile) diff --git a/parakeet/models/fastspeech/train.py b/parakeet/models/fastspeech/train.py new file mode 
100644 index 0000000..2af299d --- /dev/null +++ b/parakeet/models/fastspeech/train.py @@ -0,0 +1,139 @@ +import numpy as np +import argparse +import os +import time +import math +import jsonargparse +from pathlib import Path +from tqdm import tqdm +from tensorboardX import SummaryWriter +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +import paddle.fluid as fluid +from parse import add_config_options_to_parser +from pprint import pprint +from network import FastSpeech +from utils import get_alignment +from parakeet.models.dataloader.jlspeech import LJSpeechLoader +from parakeet.models.transformerTTS.network import TransformerTTS + +class MyDataParallel(dg.parallel.DataParallel): + """ + A data parallel proxy for model. + """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key is "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + +def main(cfg): + + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 + + if local_rank == 0: + # Print the whole config setting. 
+ pprint(jsonargparse.namespace_to_dict(cfg)) + + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'fastspeech') + + writer = SummaryWriter(path) if local_rank == 0 else None + + with dg.guard(place): + transformerTTS = TransformerTTS(cfg) + model_path = os.path.join(cfg.transtts_path, "transformer") + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step))) + #for param in transformerTTS.state_dict(): + # print(param) + + transformerTTS.set_dict(model_dict) + transformerTTS.eval() + + model = FastSpeech(cfg) + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step)) + + reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("load checkpoint!!!") + + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + model = MyDataParallel(model, strategy) + + for epoch in range(cfg.epochs): + pbar = tqdm(reader) + + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + character, mel, mel_input, pos_text, pos_mel, text_length = data + + _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) + alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32) + + global_step += 1 + + #Forward + result= model(character, + pos_text, + mel_pos=pos_mel, + length_target=alignment) + mel_output, mel_output_postnet, duration_predictor_output, _, _ = result + mel_loss = layers.mse_loss(mel_output, mel) + mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) + 
duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment))) + total_loss = mel_loss + mel_postnet_loss + duration_loss + + if local_rank==0: + print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy())) + + writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) + writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) + writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) + writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + + + if cfg.use_data_parallel: + total_loss = model.scale_loss(total_loss) + total_loss.backward() + model.apply_collective_grads() + else: + total_loss.backward() + optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + model.clear_gradients() + + # save checkpoint + if local_rank==0 and global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + if local_rank==0: + writer.close() + + +if __name__ =='__main__': + parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c config/fastspeech.yaml'.split()) + main(cfg) \ No newline at end of file diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py new file mode 100644 index 0000000..7517a13 --- /dev/null +++ b/parakeet/models/fastspeech/utils.py @@ -0,0 +1,32 @@ +import numpy as np + +def get_alignment(attn_probs, n_head): + max_F = 0 + assert attn_probs[0].shape[0] % n_head == 0 + batch_size = int(attn_probs[0].shape[0] // n_head) + for i in 
range(len(attn_probs)): + multi_attn = attn_probs[i].numpy() + for j in range(n_head): + attn = multi_attn[j*batch_size:(j+1)*batch_size] + F = score_F(attn) + if max_F < F: + max_F = F + max_attn = attn + alignment = compute_duration(max_attn) + return alignment + +def score_F(attn): + max = np.max(attn, axis=-1) + mean = np.mean(max) + return mean + +def compute_duration(attn): + alignment = np.zeros([attn.shape[0],attn.shape[2]]) + for i in range(attn.shape[0]): + for j in range(attn.shape[1]): + max_index = attn[i,j].tolist().index(attn[i,j].max()) + alignment[i,max_index] += 1 + + return alignment + + diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml index 5753ab1..7937c5e 100644 --- a/parakeet/models/transformerTTS/config/train_postnet.yaml +++ b/parakeet/models/transformerTTS/config/train_postnet.yaml @@ -10,9 +10,8 @@ audio: ref_level_db: 20 outputs_per_step: 1 -network: - hidden_size: 256 - embedding_size: 512 +hidden_size: 256 +embedding_size: 512 batch_size: 32 diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml index 3e56a4f..038848b 100644 --- a/parakeet/models/transformerTTS/config/train_transformer.yaml +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -10,15 +10,15 @@ audio: ref_level_db: 20 outputs_per_step: 1 -network: - hidden_size: 256 - embedding_size: 512 + +hidden_size: 384 #256 +embedding_size: 384 #512 batch_size: 32 epochs: 10000 lr: 0.001 -save_step: 500 +save_step: 10 image_step: 2000 use_gpu: True use_data_parallel: True diff --git a/parakeet/models/transformerTTS/data.py b/parakeet/models/transformerTTS/data.py index f432640..8fa9182 100644 --- a/parakeet/models/transformerTTS/data.py +++ b/parakeet/models/transformerTTS/data.py @@ -3,10 +3,10 @@ import numpy as np from paddle import fluid from parakeet.data.sampler import DistributedSampler from 
parakeet.data.datacargo import DataCargo -from preprocess import batch_examples, LJSpeech, batch_examples_postnet +from preprocess import batch_examples, LJSpeech, batch_examples_vocoder class LJSpeechLoader: - def __init__(self, config, nranks, rank, is_postnet=False): + def __init__(self, config, nranks, rank, is_vocoder=False): place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() LJSPEECH_ROOT = Path(config.data_path) @@ -15,8 +15,8 @@ class LJSpeechLoader: assert config.batch_size % nranks == 0 each_bs = config.batch_size // nranks - if is_postnet: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True) + if is_vocoder: + dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples_vocoder, drop_last=True) else: dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=True, collate_fn=batch_examples, drop_last=True) diff --git a/parakeet/models/transformerTTS/layers.py b/parakeet/models/transformerTTS/layers.py index 88f110f..7a8e97e 100644 --- a/parakeet/models/transformerTTS/layers.py +++ b/parakeet/models/transformerTTS/layers.py @@ -14,7 +14,6 @@ class Conv1D(dg.Layer): """ def __init__(self, - name_scope, in_channels, num_filters, filter_size=3, @@ -28,7 +27,7 @@ class Conv1D(dg.Layer): act=None, data_format='NCT', dtype="float32"): - super(Conv1D, self).__init__(name_scope, dtype=dtype) + super(Conv1D, self).__init__(dtype=dtype) self.padding = padding self.in_channels = in_channels @@ -41,7 +40,7 @@ class Conv1D(dg.Layer): self.data_format = data_format self.conv = dg.Conv2D( - self.full_name(), + in_channels=in_channels, num_filters=num_filters, filter_size=(1, filter_size), stride=(1, stride), @@ -77,7 +76,6 @@ class Pool1D(dg.Layer): A Pool 1D block implemented with Pool2D. 
""" def __init__(self, - name_scope, pool_size=-1, pool_type='max', pool_stride=1, @@ -88,7 +86,7 @@ class Pool1D(dg.Layer): exclusive=True, data_format='NCT', dtype='float32'): - super(Pool1D, self).__init__(name_scope, dtype=dtype) + super(Pool1D, self).__init__(dtype=dtype) self.pool_size = pool_size self.pool_type = pool_type self.pool_stride = pool_stride @@ -101,7 +99,7 @@ class Pool1D(dg.Layer): self.dtype = dtype - self.pool2d = dg.Pool2D(self.full_name(), [1,pool_size], pool_type = pool_type, + self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type, pool_stride = [1,pool_stride], pool_padding = [0, pool_padding], global_pooling = global_pooling, use_cudnn = use_cudnn, ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype) @@ -127,7 +125,6 @@ class Pool1D(dg.Layer): class DynamicGRU(dg.Layer): def __init__(self, - scope_name, size, param_attr=None, bias_attr=None, @@ -137,9 +134,8 @@ class DynamicGRU(dg.Layer): h_0=None, origin_mode=False, init_size=None): - super(DynamicGRU, self).__init__(scope_name) + super(DynamicGRU, self).__init__() self.gru_unit = dg.GRUUnit( - self.full_name(), size * 3, param_attr=param_attr, bias_attr=bias_attr, diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index f83bff5..8e003da 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -3,339 +3,63 @@ from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers -from layers import Conv1D, Pool1D, DynamicGRU +from parakeet.modules.layers import Conv1D, Pool1D +from parakeet.modules.dynamicGRU import DynamicGRU import numpy as np -class FC(dg.Layer): - def __init__(self, name_scope, in_features, out_features, is_bias=True, dtype="float32", gain=1): - super(FC, self).__init__(name_scope) - self.in_features = in_features - self.out_features = out_features - self.is_bias = is_bias - self.dtype 
= dtype - self.gain = gain - - self.weight = self.create_parameter(fluid.ParamAttr(name='weight'), shape=(in_features, out_features), - dtype=dtype, - default_initializer = fluid.initializer.XavierInitializer()) - #self.weight = gain * self.weight - # mind the implicit conversion to ParamAttr for many cases - if is_bias is not False: - k = math.sqrt(1 / in_features) - self.bias = self.create_parameter(fluid.ParamAttr(name='bias'), shape=(out_features, ), - is_bias=True, - dtype=dtype, - default_initializer = fluid.initializer.Uniform(low=-k, high=k)) - - # 默认初始化权重使用 Xavier 的方法,偏置使用均匀分布,范围是(-\sqrt{k},/sqrt{k}),k=1/infeature - - def forward(self, x): - x = fluid.layers.matmul(x, self.weight) - if hasattr(self, "bias"): - x = fluid.layers.elementwise_add(x, self.bias) - return x - -class Conv(dg.Layer): - def __init__(self, name_scope, in_channels, out_channels, filter_size=1, - padding=0, dilation=1, stride=1, use_cudnn=True, - data_format="NCT", is_bias=True, gain=1): - super(Conv, self).__init__(name_scope) - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_size = filter_size - self.padding = padding - self.dilation = dilation - self.stride = stride - self.use_cudnn = use_cudnn - self.data_format = data_format - self.is_bias = is_bias - self.gain = gain - - self.weight_attr = fluid.ParamAttr(name='weight', initializer=fluid.initializer.XavierInitializer()) - self.bias_attr = None - if is_bias is not False: - k = math.sqrt(1 / in_channels) - self.bias_attr = fluid.ParamAttr(name='bias', initializer=fluid.initializer.Uniform(low=-k, high=k)) - - self.conv = Conv1D( self.full_name(), - in_channels = in_channels, - num_filters = out_channels, - filter_size = filter_size, - padding = padding, - dilation = dilation, - stride = stride, - param_attr = self.weight_attr, - bias_attr = self.bias_attr, - use_cudnn = use_cudnn, - data_format = data_format) - - def forward(self, x): - x = self.conv(x) - return x class EncoderPrenet(dg.Layer): - def 
__init__(self, name_scope, embedding_size, num_hidden, use_cudnn=True): - super(EncoderPrenet, self).__init__(name_scope) + def __init__(self, embedding_size, num_hidden, use_cudnn=True): + super(EncoderPrenet, self).__init__() self.embedding_size = embedding_size self.num_hidden = num_hidden self.use_cudnn = use_cudnn - self.embedding = dg.Embedding(self.full_name(), - size = [len(symbols), embedding_size], + self.embedding = dg.Embedding( size = [len(symbols), embedding_size], param_attr = fluid.ParamAttr(name='weight'), padding_idx = None) - self.conv1 = Conv(self.full_name(), - in_channels = embedding_size, + self.conv_list = [] + self.conv_list.append(Conv1D(in_channels = embedding_size, out_channels = num_hidden, filter_size = 5, padding = int(np.floor(5/2)), use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.conv2 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.conv3 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - - self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') - self.batch_norm2 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') - self.batch_norm3 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name 
= 'moving_var', - data_layout='NCHW') + data_format = "NCT")) + for _ in range(2): + self.conv_list = Conv1D(in_channels = num_hidden, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT") - self.projection = FC(self.full_name(), num_hidden, num_hidden) - - def forward(self, x): - x = self.embedding(x) #(batch_size, seq_len, embending_size) - x = layers.transpose(x,[0,2,1]) - x = layers.dropout(layers.relu(self.batch_norm1(self.conv1(x))), 0.2) - x = layers.dropout(layers.relu(self.batch_norm2(self.conv2(x))), 0.2) - x = layers.dropout(layers.relu(self.batch_norm3(self.conv3(x))), 0.2) - x = layers.transpose(x,[0,2,1]) #(N,T,C) - x = self.projection(x) - return x - -class FFN(dg.Layer): - def __init__(self, name_scope, num_hidden, use_cudnn=True): - super(FFN, self).__init__(name_scope) - self.num_hidden = num_hidden - self.use_cudnn = use_cudnn - self.w_1 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden * 4, - filter_size = 1, - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.w_2 = Conv(self.full_name(), - in_channels = num_hidden * 4, - out_channels = num_hidden, - filter_size = 1, - use_cudnn = use_cudnn, - data_format = "NCT", - gain = math.sqrt(2)) - self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) - - def forward(self, input): - #FFN Networt - x = layers.transpose(input, [0,2,1]) - x = self.w_2(layers.relu(self.w_1(x))) - x = layers.transpose(x,[0,2,1]) - - # dropout - # x = layers.dropout(x, 0.1) - # not sure where dropout should be placed, in paper should before residual, - # but the diagonal alignment did not appear correctly in the attention plot. 
- - # residual connection - x = x + input - - - #layer normalization - x = self.layer_norm(x) - - return x - -class DecoderPrenet(dg.Layer): - def __init__(self, name_scope, input_size, hidden_size, output_size, dropout_rate=0.5): - super(DecoderPrenet, self).__init__(name_scope) - self.input_size = input_size - self.hidden_size = hidden_size - self.output_size = output_size - self.dropout_rate = dropout_rate - - self.fc1 = FC(self.full_name(), input_size, hidden_size) #in pytorch this gian=1 - self.fc2 = FC(self.full_name(), hidden_size, output_size) - - def forward(self, x): - x = layers.dropout(layers.relu(self.fc1(x)), self.dropout_rate) - x = layers.dropout(layers.relu(self.fc2(x)), self.dropout_rate) - return x - -class ScaledDotProductAttention(dg.Layer): - def __init__(self, name_scope, d_key): - super(ScaledDotProductAttention, self).__init__(name_scope) - - self.d_key = d_key - - # please attention this mask is diff from pytorch - def forward(self, key, value, query, mask=None, query_mask=None): - # Compute attention score - attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y - attention = attention / math.sqrt(self.d_key) - - # Mask key to ignore padding - if mask is not None: - attention = attention * mask - mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) - attention = attention + mask - - - attention = layers.softmax(attention) - # Mask query to ignore padding - # Not sure how to work - if query_mask is not None: - attention = attention * query_mask - - result = layers.matmul(attention, value) - return result, attention - -class MultiheadAttention(dg.Layer): - def __init__(self, name_scope, num_hidden, num_head=4): - super(MultiheadAttention, self).__init__(name_scope) - self.num_hidden = num_hidden - self.num_hidden_per_attn = num_hidden // num_head - self.num_head = num_head - - self.key = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) - self.value = FC(self.full_name(), num_hidden, num_hidden, 
is_bias=False) - self.query = FC(self.full_name(), num_hidden, num_hidden, is_bias=False) - - self.scal_attn = ScaledDotProductAttention(self.full_name(), self.num_hidden_per_attn) - - self.fc = FC(self.full_name(), num_hidden * 2, num_hidden) - - self.layer_norm = dg.LayerNorm(self.full_name(), begin_norm_axis=2) - - def forward(self, key, value, query_input, mask=None, query_mask=None): - batch_size = key.shape[0] - seq_len_key = key.shape[1] - seq_len_query = query_input.shape[1] - - # repeat masks h times - if query_mask is not None: - query_mask = layers.unsqueeze(query_mask, axes=[-1]) - query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) - if mask is not None: - mask = layers.expand(mask, (self.num_head, 1, 1)) - - # Make multihead attention - # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) - key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) - value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.num_hidden_per_attn]) - query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.num_hidden_per_attn]) - - key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) - value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.num_hidden_per_attn]) - query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.num_hidden_per_attn]) - - result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) - - # concat all multihead result - result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.num_hidden_per_attn]) - result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) - #print(result.().shape) - # concat result with input - result = layers.concat([query_input, result], axis=-1) - - result = 
self.fc(result) - result = result + query_input - - result = self.layer_norm(result) - return result, attention - -class PostConvNet(dg.Layer): - def __init__(self, name_scope, config): - super(PostConvNet, self).__init__(name_scope) - - num_hidden = config.network.hidden_size - self.num_hidden = num_hidden - self.conv1 = Conv(self.full_name(), - in_channels = config.audio.num_mels * config.audio.outputs_per_step, - out_channels = num_hidden, - filter_size = 5, - padding = 4, - use_cudnn = config.use_gpu, - data_format = "NCT", - gain = 5 / 3) - self.conv_list = [Conv(self.full_name(), - in_channels = num_hidden, - out_channels = num_hidden, - filter_size = 5, - padding = 4, - use_cudnn = config.use_gpu, - data_format = "NCT", - gain = 5 / 3) for _ in range(3)] for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) - self.conv5 = Conv(self.full_name(), - in_channels = num_hidden, - out_channels = config.audio.num_mels * config.audio.outputs_per_step, - filter_size = 5, - padding = 4, - use_cudnn = config.use_gpu, - data_format = "NCT") - self.batch_norm_list = [dg.BatchNorm(self.full_name(), num_hidden, + self.batch_norm_list = [dg.BatchNorm(num_hidden, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', moving_variance_name = 'moving_var', data_layout='NCHW') for _ in range(3)] + for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) - self.batch_norm1 = dg.BatchNorm(self.full_name(), num_hidden, - param_attr = fluid.ParamAttr(name='weight'), - bias_attr = fluid.ParamAttr(name='bias'), - moving_mean_name = 'moving_mean', - moving_variance_name = 'moving_var', - data_layout='NCHW') - def forward(self, input): - input = layers.dropout(layers.tanh(self.batch_norm1(self.conv1(input)[:, :, :-4])),0.1) + self.projection = dg.Linear(num_hidden, num_hidden) + + def forward(self, x): + x = self.embedding(x) 
#(batch_size, seq_len, embending_size) + x = layers.transpose(x,[0,2,1]) for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): - input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :-4])),0.1) - input = self.conv5(input)[:, :, :-4] - return input + x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) + x = layers.transpose(x,[0,2,1]) #(N,T,C) + x = self.projection(x) + return x class CBHG(dg.Layer): - def __init__(self, name_scope, config, K=16, projection_size = 256, num_gru_layers=2, + def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, max_pool_kernel_size=2, is_post=False): - super(CBHG, self).__init__(name_scope) + super(CBHG, self).__init__() """ :param hidden_size: dimension of hidden unit :param K: # of convolution banks @@ -344,19 +68,16 @@ class CBHG(dg.Layer): :param max_pool_kernel_size: max pooling kernel size :param is_post: whether post processing or not """ - hidden_size = config.network.hidden_size self.hidden_size = hidden_size self.projection_size = projection_size self.conv_list = [] - self.conv_list.append(Conv(self.full_name(), - in_channels = projection_size, + self.conv_list.append(Conv1D(in_channels = projection_size, out_channels = hidden_size, filter_size = 1, padding = int(np.floor(1/2)), data_format = "NCT")) for i in range(2,K+1): - self.conv_list.append(Conv(self.full_name(), - in_channels = hidden_size, + self.conv_list.append(Conv1D(in_channels = hidden_size, out_channels = hidden_size, filter_size = i, padding = int(np.floor(i/2)), @@ -367,7 +88,7 @@ class CBHG(dg.Layer): self.batchnorm_list = [] for i in range(K): - self.batchnorm_list.append(dg.BatchNorm(self.full_name(), hidden_size, + self.batchnorm_list.append(dg.BatchNorm(hidden_size, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', @@ -379,69 +100,63 @@ class CBHG(dg.Layer): conv_outdim = hidden_size * K - self.conv_projection_1 = 
Conv(self.full_name(), - in_channels = conv_outdim, + self.conv_projection_1 = Conv1D(in_channels = conv_outdim, out_channels = hidden_size, filter_size = 3, padding = int(np.floor(3/2)), data_format = "NCT") - self.conv_projection_2 = Conv(self.full_name(), - in_channels = hidden_size, + self.conv_projection_2 = Conv1D(in_channels = hidden_size, out_channels = projection_size, filter_size = 3, padding = int(np.floor(3/2)), data_format = "NCT") - self.batchnorm_proj_1 = dg.BatchNorm(self.full_name(), hidden_size, + self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', moving_variance_name = 'moving_var', data_layout='NCHW') - self.batchnorm_proj_2 = dg.BatchNorm(self.full_name(), projection_size, + self.batchnorm_proj_2 = dg.BatchNorm(projection_size, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), moving_mean_name = 'moving_mean', moving_variance_name = 'moving_var', data_layout='NCHW') - self.max_pool = Pool1D(self.full_name(), pool_size = max_pool_kernel_size, + self.max_pool = Pool1D(pool_size = max_pool_kernel_size, pool_type='max', pool_stride=1, pool_padding=1, data_format = "NCT") - self.highway = Highwaynet(self.full_name(), self.projection_size) + self.highway = Highwaynet(self.projection_size) - h_0 = np.zeros((config.batch_size, hidden_size // 2), dtype="float32") + h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = dg.to_variable(h_0) - self.fc_forward1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.fc_reverse1 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.gru_forward1 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, param_attr = 
fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse = False, origin_mode = True, h_0 = h_0) - self.gru_reverse1 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse=True, origin_mode=True, h_0 = h_0) - self.fc_forward2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.fc_reverse2 = FC(self.full_name(), hidden_size, hidden_size // 2 * 3) - self.gru_forward2 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3) + self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse = False, origin_mode = True, h_0 = h_0) - self.gru_reverse2 = DynamicGRU(self.full_name(), - size = self.hidden_size // 2, + self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, param_attr = fluid.ParamAttr(name='weight'), bias_attr = fluid.ParamAttr(name='bias'), is_reverse=True, @@ -491,8 +206,8 @@ class CBHG(dg.Layer): return out class Highwaynet(dg.Layer): - def __init__(self, name_scope, num_units, num_layers=4): - super(Highwaynet, self).__init__(name_scope) + def __init__(self, num_units, num_layers=4): + super(Highwaynet, self).__init__() self.num_units = num_units self.num_layers = num_layers @@ -500,8 +215,8 @@ class Highwaynet(dg.Layer): self.linears = [] for i in range(num_layers): - self.linears.append(FC(self.full_name(), num_units, num_units)) - self.gates.append(FC(self.full_name(), num_units, num_units)) + self.linears.append(dg.Linear(num_units, num_units)) + self.gates.append(dg.Linear(num_units, num_units)) for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): self.add_sublayer("linears_{}".format(i), linear) 
diff --git a/parakeet/models/transformerTTS/network.py b/parakeet/models/transformerTTS/network.py index 3d356dc..0536f68 100644 --- a/parakeet/models/transformerTTS/network.py +++ b/parakeet/models/transformerTTS/network.py @@ -1,39 +1,42 @@ -from module import * -from utils import get_positional_table, get_sinusoid_encoding_table +from parakeet.models.transformerTTS.module import * import paddle.fluid.dygraph as dg import paddle.fluid as fluid +from parakeet.modules.layers import Conv1D +from parakeet.modules.utils import * +from parakeet.modules.multihead_attention import MultiheadAttention +from parakeet.modules.feed_forward import PositionwiseFeedForward +from parakeet.modules.prenet import PreNet +from parakeet.modules.post_convnet import PostConvNet + class Encoder(dg.Layer): - def __init__(self, name_scope, embedding_size, num_hidden, config): - super(Encoder, self).__init__(name_scope) + def __init__(self, embedding_size, num_hidden, config): + super(Encoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr(name='alpha', initializer=fluid.initializer.Constant(value=1.0)) - self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32') + self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(name_scope=self.full_name(), - size=[1024, num_hidden], + self.pos_emb = dg.Embedding(size=[1024, num_hidden], padding_idx=0, param_attr=fluid.ParamAttr( name='weight', initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) - self.encoder_prenet = EncoderPrenet(name_scope = self.full_name(), - embedding_size = embedding_size, + self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, num_hidden = num_hidden, use_cudnn=config.use_gpu) - self.layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + self.layers = [MultiheadAttention(num_hidden, 
num_hidden, num_hidden) for _ in range(3)] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn = config.use_gpu) for _ in range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) def forward(self, x, positional): if fluid.framework._dygraph_tracer()._train_mode: - query_mask = (positional != 0).astype(np.float32) - mask = (positional != 0).astype(np.float32) - mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1]) + query_mask = get_non_pad_mask(positional) + mask = get_attn_key_pad_mask(positional, x) else: query_mask, mask = None, None @@ -59,65 +62,60 @@ class Encoder(dg.Layer): return x, query_mask, attentions class Decoder(dg.Layer): - def __init__(self, name_scope, num_hidden, config): - super(Decoder, self).__init__(name_scope) + def __init__(self, num_hidden, config): + super(Decoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr(name='alpha') - self.alpha = self.create_parameter(param, shape=(1,), dtype='float32', + self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(name_scope=self.full_name(), - size=[1024, num_hidden], + self.pos_emb = dg.Embedding(size=[1024, num_hidden], padding_idx=0, param_attr=fluid.ParamAttr( name='weight', initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) - self.decoder_prenet = DecoderPrenet(self.full_name(), - input_size = config.audio.num_mels, + self.decoder_prenet = PreNet(input_size = config.audio.num_mels, hidden_size = num_hidden * 2, output_size = num_hidden, dropout_rate=0.2) - 
self.linear = FC(self.full_name(), num_hidden, num_hidden) + self.linear = dg.Linear(num_hidden, num_hidden) - self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] for i, layer in enumerate(self.selfattn_layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)] + self.attn_layers = [MultiheadAttention(num_hidden, num_hidden, num_hidden) for _ in range(3)] for i, layer in enumerate(self.attn_layers): self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step) - self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1) + self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) + self.stop_linear = dg.Linear(num_hidden, 1) - self.postconvnet = PostConvNet(self.full_name(), config) + self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, + filter_size = 5, padding = 4, num_conv=5, + outputs_per_step=config.audio.outputs_per_step, + use_cudnn = config.use_gpu) def forward(self, key, value, query, c_mask, positional): - batch_size = key.shape[0] - decoder_len = query.shape[1] # get decoder mask with triangular matrix if fluid.framework._dygraph_tracer()._train_mode: - #zeros = np.zeros(positional.shape, dtype=np.float32) - m_mask = (positional != 0).astype(np.float32) - mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1) - mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), 
axis=0) ,batch_size, axis=0) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + m_mask = get_non_pad_mask(positional) + mask = get_attn_key_pad_mask(positional, query) + triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) + mask = mask + triu_tensor + mask = fluid.layers.cast(mask != 0, np.float32) - # (batch_size, decoder_len, decoder_len) - zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(np.float32), axes=2), [1,1,decoder_len]) - # (batch_size, decoder_len, seq_len) - zero_mask = fluid.layers.transpose(zero_mask, [0,2,1]) - + # (batch_size, decoder_len, encoder_len) + zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) else: - mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) + mask = fluid.layers.cast(dg.to_variable(mask != 0), np.float32) m_mask, zero_mask = None, None - #import pdb; pdb.set_trace() # Decoder pre-network query = self.decoder_prenet(query) @@ -145,21 +143,21 @@ class Decoder(dg.Layer): # Mel linear projection mel_out = self.mel_linear(query) # Post Mel Network - postnet_input = layers.transpose(mel_out, [0,2,1]) - out = self.postconvnet(postnet_input) - out = postnet_input + out - out = layers.transpose(out, [0,2,1]) + out = self.postconvnet(mel_out) + out = mel_out + out # Stop tokens stop_tokens = self.stop_linear(query) + stop_tokens = layers.squeeze(stop_tokens, [-1]) + stop_tokens = layers.sigmoid(stop_tokens) return mel_out, out, attn_list, stop_tokens, selfattn_list -class Model(dg.Layer): - def __init__(self, name_scope, config): - super(Model, self).__init__(name_scope) - self.encoder = Encoder(self.full_name(), config.network.embedding_size, config.network.hidden_size, config) - self.decoder = Decoder(self.full_name(), 
config.network.hidden_size, config) +class TransformerTTS(dg.Layer): + def __init__(self, config): + super(TransformerTTS, self).__init__() + self.encoder = Encoder(config.embedding_size, config.hidden_size, config) + self.decoder = Decoder(config.hidden_size, config) self.config = config def forward(self, characters, mel_input, pos_text, pos_mel): @@ -180,16 +178,16 @@ class ModelPostNet(dg.Layer): """ CBHG Network (mel -> linear) """ - def __init__(self, name_scope, config): - super(ModelPostNet, self).__init__(name_scope) - self.pre_proj = Conv(self.full_name(), - in_channels = config.audio.num_mels, - out_channels = config.network.hidden_size, + def __init__(self, config): + super(ModelPostNet, self).__init__() + self.pre_proj = Conv1D(in_channels = config.audio.num_mels, + out_channels = config.hidden_size, + filter_size=1, data_format = "NCT") - self.cbhg = CBHG(self.full_name(), config) - self.post_proj = Conv(self.full_name(), - in_channels = config.audio.num_mels, + self.cbhg = CBHG(config.hidden_size, config.batch_size) + self.post_proj = Conv1D(in_channels = config.hidden_size, out_channels = (config.audio.n_fft // 2) + 1, + filter_size=1, data_format = "NCT") def forward(self, mel): diff --git a/parakeet/models/transformerTTS/parse.py b/parakeet/models/transformerTTS/parse.py index 0c09d01..87a67e9 100644 --- a/parakeet/models/transformerTTS/parse.py +++ b/parakeet/models/transformerTTS/parse.py @@ -22,9 +22,9 @@ def add_config_options_to_parser(parser): parser.add_argument('--audio.outputs_per_step', type=int, default=1, help="the outputs per step.") - parser.add_argument('--network.hidden_size', type=int, default=256, + parser.add_argument('--hidden_size', type=int, default=256, help="the hidden size in network.") - parser.add_argument('--network.embedding_size', type=int, default=512, + parser.add_argument('--embedding_size', type=int, default=512, help="the embedding vector size.") parser.add_argument('--batch_size', type=int, default=32, diff --git 
a/parakeet/models/transformerTTS/preprocess.py b/parakeet/models/transformerTTS/preprocess.py index 61ed353..b128b00 100644 --- a/parakeet/models/transformerTTS/preprocess.py +++ b/parakeet/models/transformerTTS/preprocess.py @@ -62,20 +62,6 @@ class LJSpeech(Dataset): phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) return (mag, mel, phonemes) # maybe we need to implement it as a map in the future - def _batch_examples(self, minibatch): - mag_batch = [] - mel_batch = [] - phoneme_batch = [] - for example in minibatch: - mag, mel, phoneme = example - mag_batch.append(mag) - mel_batch.append(mel) - phoneme_batch.append(phoneme) - mag_batch = SpecBatcher(pad_value=0.)(mag_batch) - mel_batch = SpecBatcher(pad_value=0.)(mel_batch) - phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch) - return (mag_batch, mel_batch, phoneme_batch) - def __getitem__(self, index): metadatum = self.metadata.iloc[index] example = self._get_example(metadatum) @@ -121,7 +107,7 @@ def batch_examples(batch): mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) -def batch_examples_postnet(batch): +def batch_examples_vocoder(batch): mels=[] mags=[] for data in batch: diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py index 13e0de0..9c89d16 100644 --- a/parakeet/models/transformerTTS/synthesis.py +++ b/parakeet/models/transformerTTS/synthesis.py @@ -28,8 +28,8 @@ def synthesis(text_input, cfg): writer = SummaryWriter(path) with dg.guard(place): - model = Model('transtts', cfg) - model_postnet = ModelPostNet('postnet', cfg) + model = Model(cfg) + model_postnet = ModelPostNet(cfg) model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) diff --git 
a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py index 8beeece..2f893f2 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -47,7 +47,7 @@ def main(cfg): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - model = ModelPostNet('postnet', cfg) + model = ModelPostNet(cfg) model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) @@ -62,7 +62,7 @@ def main(cfg): strategy = dg.parallel.prepare_context() model = MyDataParallel(model, strategy) - reader = LJSpeechLoader(cfg, nranks, local_rank, is_postnet=True).reader() + reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader() for epoch in range(cfg.epochs): pbar = tqdm(reader) @@ -74,7 +74,6 @@ def main(cfg): global_step += 1 mag_pred = model(mel) - loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) if cfg.use_data_parallel: loss = model.scale_loss(loss) diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index 065be6d..fc522ae 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -9,7 +9,8 @@ import jsonargparse from parse import add_config_options_to_parser from pprint import pprint from matplotlib import cm -from data import LJSpeechLoader +from parakeet.modules.utils import cross_entropy +from parakeet.models.dataloader.jlspeech import LJSpeechLoader class MyDataParallel(dg.parallel.DataParallel): """ @@ -49,7 +50,7 @@ def main(cfg): writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - model = Model('transtts', cfg) + model = TransformerTTS(cfg) model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(4000 *( cfg.lr ** 2)), 4000)) @@ -75,15 +76,22 @@ def main(cfg): global_step += 1 
class DynamicGRU(dg.Layer):
    """
    Run a single ``dg.GRUUnit`` step by step along axis 1 of the input and
    return the hidden state of every step.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        """
        Args:
            size (int): stored as ``self.size``; the GRU unit is built with
                ``size * 3``.
            param_attr: forwarded to ``dg.GRUUnit``.
            bias_attr: forwarded to ``dg.GRUUnit``.
            is_reverse (bool): visit the steps from last to first.
            gate_activation (str): forwarded to ``dg.GRUUnit``.
            candidate_activation (str): forwarded to ``dg.GRUUnit`` as its
                ``activation`` argument.
            h_0: hidden state handed to the unit at the first visited step.
            origin_mode (bool): forwarded to ``dg.GRUUnit``.
            init_size: accepted but not used by this class.
        """
        super(DynamicGRU, self).__init__()
        self.gru_unit = dg.GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """
        Args:
            inputs (Variable): 3-D input; axis 1 is the axis that is stepped
                over (presumably (B, T, 3 * size) -- TODO confirm).
        Returns:
            Variable: the per-step hidden states concatenated on axis 1, in
                the same step order as ``inputs``.
        """
        num_steps = inputs.shape[1]
        if self.is_reverse:
            step_order = range(num_steps - 1, -1, -1)
        else:
            step_order = range(num_steps)

        # NOTE(review): h_0 defaults to None and is passed to the GRU unit
        # unchanged -- confirm callers always provide an initial state.
        state = self.h_0
        outputs = []
        for t in step_order:
            # Keep the step axis while slicing, then flatten it away.
            frame = inputs[:, t:t + 1, :]
            frame = layers.reshape(
                frame, [-1, frame.shape[2]], inplace=False)
            state, _, _ = self.gru_unit(frame, state)
            outputs.append(
                layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            # Put the states back into forward step order.
            outputs.reverse()
        return layers.concat(outputs, axis=1)
class PositionwiseFeedForward(dg.Layer):
    ''' Two Conv1D layers with a ReLU in between, followed by dropout,
    a residual connection and layer normalization. '''

    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
        '''
        :param d_in: channel count of the input; also the output channel
            count of the second convolution and the LayerNorm size
        :param num_hidden: channel count between the two convolutions
        :param filter_size: filter size shared by both convolutions
        :param padding: padding shared by both convolutions
        :param use_cudnn: forwarded to both convolutions
        :param dropout: dropout probability applied after the second convolution
        '''
        super(PositionwiseFeedForward, self).__init__()
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.dropout = dropout

        # Settings common to both convolutions.
        shared = dict(filter_size=filter_size,
                      padding=padding,
                      use_cudnn=use_cudnn,
                      data_format="NTC")
        self.w_1 = Conv1D(in_channels=d_in, out_channels=num_hidden, **shared)
        self.w_2 = Conv1D(in_channels=num_hidden, out_channels=d_in, **shared)
        self.layer_norm = dg.LayerNorm(d_in)

    def forward(self, input):
        # FFN network: conv -> relu -> conv
        hidden = layers.relu(self.w_1(input))
        projected = self.w_2(hidden)

        # dropout
        projected = layers.dropout(projected, self.dropout)

        # residual connection, then layer normalization
        return self.layer_norm(projected + input)
class Conv1D(dg.Layer):
    """
    A 1-D convolution implemented with ``dg.Conv2D``.

    The wrapped Conv2D uses a ``(1, filter_size)`` kernel. ``forward`` adds a
    singleton axis before calling it and removes that axis afterwards.

    NOTE(review): the original description says stride > 1 is not allowed,
    but ``stride`` is forwarded to Conv2D without any check -- confirm.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(dtype=dtype)

        # Hyper-parameters are kept as plain attributes.
        self.in_channels = in_channels
        self.num_filters = out_channels
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # Every 1-D setting becomes the second entry of a 2-D pair whose
        # first entry is the neutral value for the dummy axis.
        conv2d_settings = dict(
            num_channels=in_channels,
            num_filters=out_channels,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
        self.conv = dg.Conv2D(**conv2d_settings)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. With data_format 'NTC' its last two axes
                are swapped before the convolution (presumably (B, C_in, T)
                for 'NCT' and (B, T, C_in) for 'NTC' -- TODO confirm).
        Returns:
            x (Variable): the convolution output, 3-D again, with its last two
                axes swapped back when data_format is 'NTC'.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])

        # Insert the dummy axis at position 2, convolve, and drop it again.
        out = self.conv(fluid.layers.unsqueeze(x, [2]))
        out = fluid.layers.squeeze(out, [2])

        if channels_last:
            out = fluid.layers.transpose(out, [0, 2, 1])
        return out
class Pool1D(dg.Layer):
    """
    A 1-D pooling block implemented with ``dg.Pool2D``.

    The wrapped Pool2D uses ``[1, pool_size]`` as its window. ``forward`` adds
    a singleton axis before calling it and removes that axis afterwards.
    """

    def __init__(self,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT'):
        super(Pool1D, self).__init__()

        # Hyper-parameters are kept as plain attributes.
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format

        # Each 1-D setting becomes the second entry of a 2-D pair.
        window = [1, pool_size]
        self.pool2d = dg.Pool2D(
            window,
            pool_type=pool_type,
            pool_stride=[1, pool_stride],
            pool_padding=[0, pool_padding],
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. With data_format 'NTC' its last two axes
                are swapped before pooling (presumably (B, C, T) for 'NCT'
                and (B, T, C) for 'NTC' -- TODO confirm).
        Returns:
            x (Variable): the pooled output, 3-D again, with its last two
                axes swapped back when data_format is 'NTC'.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])

        # Insert the dummy axis at position 2, pool, and drop it again.
        pooled = self.pool2d(fluid.layers.unsqueeze(x, [2]))
        pooled = fluid.layers.squeeze(pooled, [2])

        if channels_last:
            pooled = fluid.layers.transpose(pooled, [0, 2, 1])
        return pooled
class ScaledDotProductAttention(dg.Layer):
    """
    Attention weights from ``softmax(query . key^T / sqrt(d_key))`` applied
    to ``value``.
    """

    def __init__(self, d_key):
        super(ScaledDotProductAttention, self).__init__()

        self.d_key = d_key

    # Note: this mask convention differs from the PyTorch one.
    def forward(self, key, value, query, mask=None, query_mask=None):
        """
        Args:
            key, value, query (Variable): attention operands.
            mask (Variable, optional): scores are zeroed where ``mask`` is
                non-zero and ``mask * (-2 ** 32 + 1)`` is added before the
                softmax.
            query_mask (Variable, optional): multiplied into the attention
                weights after the softmax.
        Returns:
            tuple: ``(result, attention)``, the weighted values and the
                attention weights.
        """
        # Scaled attention scores; transpose_y swaps the last two dims of key.
        scores = layers.matmul(query, key, transpose_y=True)
        scores = scores / math.sqrt(self.d_key)

        if mask is not None:
            # Clear the score wherever the mask is set, then push those
            # positions to a very large negative value.
            keep = (mask == 0).astype(np.float32)
            scores = scores * keep
            penalty = mask * (-2 ** 32 + 1)
            scores = scores + penalty

        weights = layers.softmax(scores)
        weights = layers.dropout(weights, 0.0)

        # NOTE(review): the original author was unsure this query masking
        # works as intended -- confirm.
        if query_mask is not None:
            weights = weights * query_mask

        return layers.matmul(weights, value), weights


class MultiheadAttention(dg.Layer):
    """
    Multi-head attention with an output projection, dropout, a residual
    connection to the query input, and layer normalization.
    """

    def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1):
        """
        Args:
            num_hidden (int): input size of the three projections, output
                size of the final projection, and the LayerNorm size.
            d_k (int): per-head size of the key and value projections.
            d_q (int): per-head size of the query projection.
            num_head (int): number of heads.
            dropout (float): dropout probability applied to the projected
                result.
        """
        super(MultiheadAttention, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
        self.d_k = d_k
        self.d_q = d_q
        self.dropout = dropout

        self.key = dg.Linear(num_hidden, num_head * d_k)
        self.value = dg.Linear(num_hidden, num_head * d_k)
        self.query = dg.Linear(num_hidden, num_head * d_q)

        self.scal_attn = ScaledDotProductAttention(d_k)

        self.fc = dg.Linear(num_head * d_q, num_hidden)

        self.layer_norm = dg.LayerNorm(num_hidden)

    def forward(self, key, value, query_input, mask=None, query_mask=None):
        """
        Args:
            key, value, query_input (Variable): 3-D inputs (presumably
                (batch, seq_len, num_hidden) -- TODO confirm).
            mask (Variable, optional): tiled ``num_head`` times on axis 0 and
                passed to the scaled dot-product attention.
            query_mask (Variable, optional): expanded with
                ``[num_head, 1, seq_len_key]`` and passed on likewise.
        Returns:
            tuple: ``(result, attention)``, the normalized output and the
                attention weights of all heads.
        """
        batch = key.shape[0]
        len_k = key.shape[1]
        len_q = query_input.shape[1]
        heads = self.num_head

        # Repeat the masks once per head.
        if query_mask is not None:
            query_mask = layers.expand(query_mask, [heads, 1, len_k])
        if mask is not None:
            mask = layers.expand(mask, (heads, 1, 1))

        # Project, then split the feature axis into (head, per-head size).
        k = layers.reshape(self.key(key), [batch, len_k, heads, self.d_k])
        v = layers.reshape(self.value(value), [batch, len_k, heads, self.d_k])
        q = layers.reshape(self.query(query_input), [batch, len_q, heads, self.d_q])

        # Move the head axis to the front and fold it into the batch axis.
        head_first = [2, 0, 1, 3]
        k = layers.reshape(layers.transpose(k, head_first), [-1, len_k, self.d_k])
        v = layers.reshape(layers.transpose(v, head_first), [-1, len_k, self.d_k])
        q = layers.reshape(layers.transpose(q, head_first), [-1, len_q, self.d_q])

        context, attention = self.scal_attn(k, v, q, mask=mask, query_mask=query_mask)

        # Undo the head folding and concatenate the heads on the last axis.
        context = layers.reshape(context, [heads, batch, len_q, self.d_q])
        context = layers.transpose(context, [1, 2, 0, 3])
        context = layers.reshape(context, [batch, len_q, -1])

        # Output projection, dropout, residual connection, layer norm.
        projected = layers.dropout(self.fc(context), self.dropout)
        output = self.layer_norm(projected + query_input)
        return output, attention
class PostConvNet(dg.Layer):
    """
    A stack of Conv1D + BatchNorm + tanh + dropout layers.
    """

    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
                 padding=0,
                 num_conv=5,
                 outputs_per_step=1,
                 use_cudnn=True,
                 dropout=0.1):
        """
        Args:
            n_mels (int): with ``outputs_per_step``, gives the channel count
                at both ends of the stack (``n_mels * outputs_per_step``).
            num_hidden (int): channel count between the convolutions.
            filter_size (int): filter size of every convolution.
            padding (int): padding of every convolution.
            num_conv (int): controls how many middle convolutions and
                ``num_hidden``-sized batch norms are created.
            outputs_per_step (int): see ``n_mels``.
            use_cudnn (bool): forwarded to every convolution.
            dropout (float): dropout probability applied after every layer.
        """
        super(PostConvNet, self).__init__()

        self.dropout = dropout
        mel_channels = n_mels * outputs_per_step

        # (in, out) channel pairs: one input conv, the middle convs, one
        # output conv.
        channel_pairs = [(mel_channels, num_hidden)]
        channel_pairs += [(num_hidden, num_hidden) for _ in range(1, num_conv - 1)]
        channel_pairs.append((num_hidden, mel_channels))

        self.conv_list = [
            Conv1D(in_channels=c_in,
                   out_channels=c_out,
                   filter_size=filter_size,
                   padding=padding,
                   use_cudnn=use_cudnn,
                   data_format="NCT")
            for c_in, c_out in channel_pairs
        ]
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        # NOTE(review): every BatchNorm below gets the same explicit
        # parameter / moving-statistics names -- confirm that is intended.
        norm_channels = [num_hidden for _ in range(num_conv - 1)]
        norm_channels.append(mel_channels)
        self.batch_norm_list = [
            dg.BatchNorm(channels,
                         param_attr=fluid.ParamAttr(name='weight'),
                         bias_attr=fluid.ParamAttr(name='bias'),
                         moving_mean_name='moving_mean',
                         moving_variance_name='moving_var',
                         data_layout='NCHW')
            for channels in norm_channels
        ]
        for idx, norm in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), norm)

    def forward(self, input):
        """
        Args:
            input (Variable): 3-D input whose last two axes are swapped
                before the stack (presumably (B, T, C) -- TODO confirm).
        Returns:
            Variable: the stack output with its last two axes swapped back.
        """
        x = layers.transpose(input, [0, 2, 1])
        # Every conv output is cut back to this many positions on the last axis.
        num_frames = x.shape[-1]
        for norm, conv in zip(self.batch_norm_list, self.conv_list):
            trimmed = conv(x)[:, :, :num_frames]
            x = layers.dropout(layers.tanh(norm(trimmed)), self.dropout)
        return layers.transpose(x, [0, 2, 1])
class PreNet(dg.Layer):
    """
    Two linear layers, each followed by ReLU and dropout.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        :param dropout_rate: dropout probability used after each layer
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        self.linear1 = dg.Linear(input_size, hidden_size)
        self.linear2 = dg.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        :param x: input passed to the first linear layer
        :return: output of the second linear -> relu -> dropout stage
        """
        # Same stage twice: linear -> relu -> dropout.
        for linear in (self.linear1, self.linear2):
            activated = layers.relu(linear(x))
            x = layers.dropout(activated, self.dropout_rate)
        return x
def get_non_pad_mask(seq):
    ''' Return a float32 flag (1.0 where seq is non-zero) with a trailing axis added. '''
    non_zero = (seq != 0).astype(np.float32)
    return layers.unsqueeze(non_zero, [-1])


def get_attn_key_pad_mask(seq_k, seq_q):
    ''' Build a key mask repeated along a new query axis. '''
    # NOTE(review): the flag is 1.0 where seq_k is NON-zero, although the
    # original summary says this masks the padding part -- confirm the
    # polarity the callers expect.
    query_len = seq_q.shape[1]
    key_flags = (seq_k != 0).astype(np.float32)

    # Add the query axis, then repeat the flags once per query position.
    key_flags = layers.unsqueeze(key_flags, [1])
    return layers.expand(key_flags, [1, query_len, 1])


def get_triu_tensor(seq_k, seq_q):
    ''' Return a numpy batch of strictly upper-triangular matrices of ones.

    The result has shape (seq_k.shape[0], seq_k.shape[1], seq_q.shape[1]);
    every matrix in the batch is the same.
    '''
    batch_size, key_len = seq_k.shape[0], seq_k.shape[1]
    query_len = seq_q.shape[1]

    # k=1 keeps only the entries strictly above the main diagonal.
    upper = np.triu(np.ones([key_len, query_len]), 1)
    return np.tile(upper[np.newaxis], (batch_size, 1, 1))


def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
    ''' Weighted binary cross entropy, summed over axes 0 and 1.

    Elements where label is 1 are weighted by position_weight, elements where
    label is 0 by 1. epsilon is added inside both logarithms.
    '''
    positive_term = -1 * label * layers.log(input + epsilon)
    negative_term = (1 - label) * layers.log(1 - input + epsilon)
    element_loss = positive_term - negative_term

    element_weight = label * (position_weight - 1) + 1
    return layers.reduce_sum(element_loss * element_weight, dim=[0, 1])