diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index 6b84701..b861a39 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -209,7 +209,7 @@ class AudioProcessor(object): def inv_melspectrogram(self, mel_spectrogram): S = self._denormalize(mel_spectrogram) S = self._db_to_amplitude(S + self.ref_level_db) - S = self._linear_to_mel(np.abs(S)) + S = self._mel_to_linear(np.abs(S)) if self.preemphasis: return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) return self._griffin_lim(S ** self.power) diff --git a/parakeet/models/dataloader/jlspeech.py b/parakeet/models/dataloader/jlspeech.py deleted file mode 100644 index ef55b0f..0000000 --- a/parakeet/models/dataloader/jlspeech.py +++ /dev/null @@ -1,148 +0,0 @@ -from pathlib import Path -import numpy as np -import pandas as pd -import librosa - -from paddle import fluid -from parakeet import g2p -from parakeet import audio -from parakeet.data.sampler import * -from parakeet.data.datacargo import DataCargo -from parakeet.data.dataset import Dataset -from parakeet.data.batch import TextIDBatcher, SpecBatcher - -class LJSpeechLoader: - def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True): - place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() - - LJSPEECH_ROOT = Path(config.data_path) - dataset = LJSpeech(LJSPEECH_ROOT, config) - sampler = DistributedSampler(len(dataset), nranks, rank, shuffle=shuffle) - - assert config.batch_size % nranks == 0 - each_bs = config.batch_size // nranks - if is_vocoder: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, collate_fn=batch_examples_vocoder, drop_last=True) - else: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, collate_fn=batch_examples, drop_last=True) - - self.reader = fluid.io.DataLoader.from_generator( - capacity=32, - iterable=True, - use_double_buffer=True, - return_list=True) - self.reader.set_batch_generator(dataloader, place) - - -class LJSpeech(Dataset): - def __init__(self, root, config): - super(LJSpeech, self).__init__() - assert isinstance(root, (str, Path)), "root should be a string or Path object" - self.root = root if isinstance(root, Path) else Path(root) - self.metadata = self._prepare_metadata() - self.config = config - self._ljspeech_processor = audio.AudioProcessor( - sample_rate=config.audio.sr, - num_mels=config.audio.num_mels, - min_level_db=config.audio.min_level_db, - ref_level_db=config.audio.ref_level_db, - n_fft=config.audio.n_fft, - win_length= config.audio.win_length, - hop_length= config.audio.hop_length, - power=config.audio.power, - preemphasis=config.audio.preemphasis, - signal_norm=True, - symmetric_norm=False, - max_norm=1., - mel_fmin=0, - mel_fmax=None, - clip_norm=True, - griffin_lim_iters=60, - do_trim_silence=False, - sound_norm=False) - - def _prepare_metadata(self): - csv_path = self.root.joinpath("metadata.csv") - metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3, - names=["fname", "raw_text", "normalized_text"]) - return metadata - - def _get_example(self, metadatum): - """All the code for generating an Example from a metadatum. If you want a - different preprocessing pipeline, you can override this method. - This method may require several processor, each of which has a lot of options. - In this case, you'd better pass a composed transform and pass it to the init - method. - """ - - fname, raw_text, normalized_text = metadatum - wav_path = self.root.joinpath("wavs", fname + ".wav") - - # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize - wav = self._ljspeech_processor.load_wav(str(wav_path)) - mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32) - mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32) - phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) - return (mag, mel, phonemes) # maybe we need to implement it as a map in the future - - def __getitem__(self, index): - metadatum = self.metadata.iloc[index] - example = self._get_example(metadatum) - return example - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __len__(self): - return len(self.metadata) - - -def batch_examples(batch): - texts = [] - mels = [] - mel_inputs = [] - text_lens = [] - pos_texts = [] - pos_mels = [] - for data in batch: - _, mel, text = data - mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) - text_lens.append(len(text)) - pos_texts.append(np.arange(1, len(text) + 1)) - pos_mels.append(np.arange(1, mel.shape[1] + 1)) - mels.append(mel) - texts.append(text) - - # Sort by text_len in descending order - texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] - mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] - mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] - pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] - pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] - text_lens = sorted(text_lens, reverse=True) - - # Pad sequence with largest len of the batch - texts = TextIDBatcher(pad_id=0)(texts) #(B, T) - pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) - pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels) - mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels) - return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) - -def batch_examples_vocoder(batch): - mels=[] - mags=[] - for data in batch: - mag, mel, _ = data - mels.append(mel) - mags.append(mag) - - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) - mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) - - return (mels, mags) - - - - diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/parakeet/models/fastspeech/config/fastspeech.yaml index 2a58569..3e53388 100644 --- a/parakeet/models/fastspeech/config/fastspeech.yaml +++ b/parakeet/models/fastspeech/config/fastspeech.yaml @@ -39,5 +39,8 @@ use_data_parallel: False data_path: ../../../dataset/LJSpeech-1.1 transtts_path: ../transformerTTS/checkpoint/ -transformer_step: 10 -log_dir: ./log \ No newline at end of file +transformer_step: 200000 +save_path: ./checkpoint +log_dir: ./log +#checkpoint_path: ./checkpoint +#ransformer_step: 97000 \ No newline at end of file diff --git a/parakeet/models/fastspeech/config/synthesis.yaml b/parakeet/models/fastspeech/config/synthesis.yaml new file mode 100644 index 0000000..2841b2e --- /dev/null +++ b/parakeet/models/fastspeech/config/synthesis.yaml @@ -0,0 +1,33 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 + +encoder_n_layer: 6 +encoder_head: 2 +encoder_conv1d_filter_size: 1536 +max_sep_len: 2048 +decoder_n_layer: 6 +decoder_head: 2 +decoder_conv1d_filter_size: 1536 +fs_hidden_size: 384 +duration_predictor_output_size: 256 +duration_predictor_filter_size: 3 +fft_conv1d_filter: 3 +fft_conv1d_padding: 1 +dropout: 0.1 +transformer_head: 4 + +use_gpu: True +alpha: 1.0 + +checkpoint_path: checkpoint/ +fastspeech_step: 71000 +log_dir: ./log \ No newline at end of file diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py index 633b2bb..7950728 100644 --- a/parakeet/models/fastspeech/modules.py +++ b/parakeet/models/fastspeech/modules.py @@ -102,7 +102,8 @@ class LengthRegulator(dg.Layer): else: duration_predictor_output = layers.round(duration_predictor_output) output = self.LR(x, duration_predictor_output, alpha) - mel_pos = dg.to_variable([i+1 for i in range(output.shape[1])]) + mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1)) + mel_pos = layers.unsqueeze(mel_pos, [0]) return output, mel_pos class DurationPredictor(dg.Layer): diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py index dcb76c1..5005cba 100644 --- a/parakeet/models/fastspeech/network.py +++ b/parakeet/models/fastspeech/network.py @@ -1,11 +1,11 @@ -from utils import * -from modules import FFTBlock, LengthRegulator import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.g2p.text.symbols import symbols from parakeet.modules.utils import * from parakeet.modules.post_convnet import PostConvNet from parakeet.modules.layers import Linear +from utils import * +from modules import FFTBlock, LengthRegulator class Encoder(dg.Layer): def __init__(self, @@ -203,8 +203,7 @@ class FastSpeech(dg.Layer): return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list else: length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) - decoder_output = self.decoder(length_regulator_output, decoder_pos) - + decoder_output, _ = self.decoder(length_regulator_output, decoder_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py index b43a8af..894d988 100644 --- a/parakeet/models/fastspeech/parse.py +++ b/parakeet/models/fastspeech/parse.py @@ -50,6 +50,9 @@ def add_config_options_to_parser(parser): help="the dropout in network.") parser.add_argument('--transformer_head', type=int, default=4, help="the attention head num of transformerTTS.") + parser.add_argument('--alpha', type=float, default=1.0, + help="the hyperparameter to determine the length of the expanded sequence\ + mel, thereby controlling the voice speed.") parser.add_argument('--hidden_size', type=int, default=256, help="the hidden size in model of transformerTTS.") @@ -68,6 +71,8 @@ def add_config_options_to_parser(parser): help="the learning rate for training.") parser.add_argument('--save_step', type=int, default=500, help="checkpointing interval during training.") + parser.add_argument('--fastspeech_step', type=int, default=160000, + help="Global step to restore checkpoint of fastspeech.") parser.add_argument('--use_gpu', type=bool, default=True, help="use gpu or not during training.") parser.add_argument('--use_data_parallel', type=bool, default=False, diff --git a/parakeet/models/fastspeech/synthesis.py b/parakeet/models/fastspeech/synthesis.py new file mode 100644 index 0000000..779af02 --- /dev/null +++ b/parakeet/models/fastspeech/synthesis.py @@ -0,0 +1,76 @@ +import os +from tensorboardX import SummaryWriter +from collections import OrderedDict +import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.dygraph as dg +from parakeet.g2p.en import text_to_sequence +from parakeet import audio +from network import FastSpeech + +def load_checkpoint(step, model_path): + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + new_state_dict = OrderedDict() + for param in model_dict: + if param.startswith('_layers.'): + new_state_dict[param[8:]] = model_dict[param] + else: + new_state_dict[param] = model_dict[param] + return new_state_dict + +def synthesis(text_input, cfg): + place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) + + # tensorboard + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'synthesis') + + writer = SummaryWriter(path) + + with dg.guard(place): + model = FastSpeech(cfg) + model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))) + model.eval() + + text = np.asarray(text_to_sequence(text_input)) + text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) + pos_text = np.arange(1, text.shape[1]+1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) + + mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha) + + _ljspeech_processor = audio.AudioProcessor( + sample_rate=cfg.audio.sr, + num_mels=cfg.audio.num_mels, + min_level_db=cfg.audio.min_level_db, + ref_level_db=cfg.audio.ref_level_db, + n_fft=cfg.audio.n_fft, + win_length= cfg.audio.win_length, + hop_length= cfg.audio.hop_length, + power=cfg.audio.power, + preemphasis=cfg.audio.preemphasis, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + + mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0]) + wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy()) + writer.add_audio(text_input, wav, 0, cfg.audio.sr) + print("Synthesis completed !!!") + writer.close() + +if __name__ == '__main__': + parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) + synthesis("Transformer model is so fast!", cfg) \ No newline at end of file diff --git a/parakeet/models/fastspeech/train.py b/parakeet/models/fastspeech/train.py index 84b467a..c29bf6d 100644 --- a/parakeet/models/fastspeech/train.py +++ b/parakeet/models/fastspeech/train.py @@ -5,34 +5,28 @@ import time import math import jsonargparse from pathlib import Path +from parse import add_config_options_to_parser +from pprint import pprint from tqdm import tqdm +from collections import OrderedDict from tensorboardX import SummaryWriter import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers import paddle.fluid as fluid -from parse import add_config_options_to_parser -from pprint import pprint +from parakeet.models.dataloader.ljspeech import LJSpeechLoader +from parakeet.models.transformerTTS.network import TransformerTTS from network import FastSpeech from utils import get_alignment -from parakeet.models.dataloader.jlspeech import LJSpeechLoader -from parakeet.models.transformerTTS.network import TransformerTTS -class MyDataParallel(dg.parallel.DataParallel): - """ - A data parallel proxy for model. - """ - - def __init__(self, layers, strategy): - super(MyDataParallel, self).__init__(layers, strategy) - - def __getattr__(self, key): - if key in self.__dict__: - return object.__getattribute__(self, key) - elif key is "_layers": - return object.__getattribute__(self, "_sub_layers")["_layers"] +def load_checkpoint(step, model_path): + model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + new_state_dict = OrderedDict() + for param in model_dict: + if param.startswith('_layers.'): + new_state_dict[param[8:]] = model_dict[param] else: - return getattr( - object.__getattribute__(self, "_sub_layers")["_layers"], key) + new_state_dict[param] = model_dict[param] + return new_state_dict, opti_dict def main(cfg): @@ -57,8 +51,7 @@ def main(cfg): with dg.guard(place): with fluid.unique_name.guard(): transformerTTS = TransformerTTS(cfg) - model_path = os.path.join(cfg.transtts_path, "transformer") - model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step))) + model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer")) transformerTTS.set_dict(model_dict) transformerTTS.eval() @@ -67,27 +60,29 @@ def main(cfg): model.train() optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), parameter_list=model.parameters()) - reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader() if cfg.checkpoint_path is not None: - model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) + global_step = cfg.fastspeech_step print("load checkpoint!!!") if cfg.use_data_parallel: strategy = dg.parallel.prepare_context() - model = MyDataParallel(model, strategy) + model = fluid.dygraph.parallel.DataParallel(model, strategy) for epoch in range(cfg.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) - character, mel, mel_input, pos_text, pos_mel, text_length = data + character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) - alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32) + alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32) + global_step += 1 #Forward @@ -102,7 +97,6 @@ def main(cfg): total_loss = mel_loss + mel_postnet_loss + duration_loss if local_rank==0: - #print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy())) writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py index 7517a13..a94de8d 100644 --- a/parakeet/models/fastspeech/utils.py +++ b/parakeet/models/fastspeech/utils.py @@ -1,9 +1,10 @@ import numpy as np -def get_alignment(attn_probs, n_head): +def get_alignment(attn_probs, mel_lens, n_head): max_F = 0 assert attn_probs[0].shape[0] % n_head == 0 batch_size = int(attn_probs[0].shape[0] // n_head) + #max_attn = attn_probs[0].numpy()[0,batch_size] for i in range(len(attn_probs)): multi_attn = attn_probs[i].numpy() for j in range(n_head): @@ -12,7 +13,7 @@ def get_alignment(attn_probs, n_head): if max_F < F: max_F = F max_attn = attn - alignment = compute_duration(max_attn) + alignment = compute_duration(max_attn, mel_lens) return alignment def score_F(attn): @@ -20,11 +21,12 @@ def score_F(attn): mean = np.mean(max) return mean -def compute_duration(attn): +def compute_duration(attn, mel_lens): alignment = np.zeros([attn.shape[0],attn.shape[2]]) + mel_lens = mel_lens.numpy() for i in range(attn.shape[0]): - for j in range(attn.shape[1]): - max_index = attn[i,j].tolist().index(attn[i,j].max()) + for j in range(mel_lens[i]): + max_index = np.argmax(attn[i,j]) alignment[i,max_index] += 1 return alignment diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml index 8847a6e..97b5f0f 100644 --- a/parakeet/models/transformerTTS/config/train_transformer.yaml +++ b/parakeet/models/transformerTTS/config/train_transformer.yaml @@ -24,11 +24,12 @@ save_step: 1000 image_step: 2000 use_gpu: True use_data_parallel: False +stop_token: False data_path: ../../../dataset/LJSpeech-1.1 save_path: ./checkpoint log_dir: ./log #checkpoint_path: ./checkpoint -#transformer_step: 70000 +#ransformer_step: 97000 \ No newline at end of file diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py index 1a1ebd1..e8eb8b6 100644 --- a/parakeet/models/transformerTTS/module.py +++ b/parakeet/models/transformerTTS/module.py @@ -49,6 +49,7 @@ class EncoderPrenet(dg.Layer): x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) x = layers.transpose(x,[0,2,1]) #(N,T,C) x = self.projection(x) + return x class CBHG(dg.Layer): diff --git a/parakeet/models/transformerTTS/parse.py b/parakeet/models/transformerTTS/parse.py index 584ea63..5f989de 100644 --- a/parakeet/models/transformerTTS/parse.py +++ b/parakeet/models/transformerTTS/parse.py @@ -44,13 +44,15 @@ def add_config_options_to_parser(parser): parser.add_argument('--max_len', type=int, default=400, help="The max length of audio when synthsis.") parser.add_argument('--transformer_step', type=int, default=160000, - help="Global step to restore checkpoint of transformer in synthesis.") - parser.add_argument('--postnet_step', type=int, default=100000, - help="Global step to restore checkpoint of postnet in synthesis.") + help="Global step to restore checkpoint of transformer.") + parser.add_argument('--postnet_step', type=int, default=90000, + help="Global step to restore checkpoint of postnet.") parser.add_argument('--use_gpu', type=bool, default=True, help="use gpu or not during training.") parser.add_argument('--use_data_parallel', type=bool, default=False, help="use data parallel or not during training.") + parser.add_argument('--stop_token', type=bool, default=False, + help="use stop token loss in network or not.") parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', help="the path of dataset.") diff --git a/parakeet/models/transformerTTS/preprocess.py b/parakeet/models/transformerTTS/preprocess.py deleted file mode 100644 index b128b00..0000000 --- a/parakeet/models/transformerTTS/preprocess.py +++ /dev/null @@ -1,123 +0,0 @@ -from pathlib import Path -import numpy as np -import pandas as pd -import librosa - -from parakeet import g2p -from parakeet import audio - -from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler -from parakeet.data.dataset import Dataset -from parakeet.data.datacargo import DataCargo -from parakeet.data.batch import TextIDBatcher, SpecBatcher - -_ljspeech_processor = audio.AudioProcessor( - sample_rate=22050, - num_mels=80, - min_level_db=-100, - ref_level_db=20, - n_fft=2048, - win_length= int(22050 * 0.05), - hop_length= int(22050 * 0.0125), - power=1.2, - preemphasis=0.97, - signal_norm=True, - symmetric_norm=False, - max_norm=1., - mel_fmin=0, - mel_fmax=None, - clip_norm=True, - griffin_lim_iters=60, - do_trim_silence=False, - sound_norm=False) - -class LJSpeech(Dataset): - def __init__(self, root): - super(LJSpeech, self).__init__() - assert isinstance(root, (str, Path)), "root should be a string or Path object" - self.root = root if isinstance(root, Path) else Path(root) - self.metadata = self._prepare_metadata() - - def _prepare_metadata(self): - csv_path = self.root.joinpath("metadata.csv") - metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3, - names=["fname", "raw_text", "normalized_text"]) - return metadata - - def _get_example(self, metadatum): - """All the code for generating an Example from a metadatum. If you want a - different preprocessing pipeline, you can override this method. - This method may require several processor, each of which has a lot of options. - In this case, you'd better pass a composed transform and pass it to the init - method. - """ - - fname, raw_text, normalized_text = metadatum - wav_path = self.root.joinpath("wavs", fname + ".wav") - - # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize - wav = _ljspeech_processor.load_wav(str(wav_path)) - mag = _ljspeech_processor.spectrogram(wav).astype(np.float32) - mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32) - phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) - return (mag, mel, phonemes) # maybe we need to implement it as a map in the future - - def __getitem__(self, index): - metadatum = self.metadata.iloc[index] - example = self._get_example(metadatum) - return example - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __len__(self): - return len(self.metadata) - - -def batch_examples(batch): - texts = [] - mels = [] - mel_inputs = [] - text_lens = [] - pos_texts = [] - pos_mels = [] - for data in batch: - _, mel, text = data - mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) - text_lens.append(len(text)) - pos_texts.append(np.arange(1, len(text) + 1)) - pos_mels.append(np.arange(1, mel.shape[1] + 1)) - mels.append(mel) - texts.append(text) - - # Sort by text_len in descending order - texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] - mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] - mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] - pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] - pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] - text_lens = sorted(text_lens, reverse=True) - - # Pad sequence with largest len of the batch - texts = TextIDBatcher(pad_id=0)(texts) - pos_texts = TextIDBatcher(pad_id=0)(pos_texts) - pos_mels = TextIDBatcher(pad_id=0)(pos_mels) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) - mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) - return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) - -def batch_examples_vocoder(batch): - mels=[] - mags=[] - for data in batch: - mag, mel, _ = data - mels.append(mel) - mags.append(mag) - - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) - mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) - - return (mels, mags) - - diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py index aeb9697..5420040 100644 --- a/parakeet/models/transformerTTS/synthesis.py +++ b/parakeet/models/transformerTTS/synthesis.py @@ -7,15 +7,22 @@ from tqdm import tqdm from tensorboardX import SummaryWriter import paddle.fluid as fluid import paddle.fluid.dygraph as dg -from preprocess import _ljspeech_processor from pathlib import Path import jsonargparse from parse import add_config_options_to_parser from pprint import pprint +from collections import OrderedDict +from parakeet import audio def load_checkpoint(step, model_path): - model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) - return model_dict + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + new_state_dict = OrderedDict() + for param in model_dict: + if param.startswith('_layers.'): + new_state_dict[param[8:]] = model_dict[param] + else: + new_state_dict[param] = model_dict[param] + return new_state_dict def synthesis(text_input, cfg): place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) @@ -30,7 +37,7 @@ def synthesis(text_input, cfg): with dg.guard(place): with fluid.unique_name.guard(): model = TransformerTTS(cfg) - model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) + model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer"))) model.eval() with fluid.unique_name.guard(): @@ -54,14 +61,35 @@ def synthesis(text_input, cfg): mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) mag_pred = model_postnet(postnet_pred) + _ljspeech_processor = audio.AudioProcessor( + sample_rate=cfg.audio.sr, + num_mels=cfg.audio.num_mels, + min_level_db=cfg.audio.min_level_db, + ref_level_db=cfg.audio.ref_level_db, + n_fft=cfg.audio.n_fft, + win_length= cfg.audio.win_length, + hop_length= cfg.audio.hop_length, + power=cfg.audio.power, + preemphasis=cfg.audio.preemphasis, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) writer.add_audio(text_input, wav, 0, cfg.audio.sr) if not os.path.exists(cfg.sample_path): os.mkdir(cfg.sample_path) write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav) + writer.close() if __name__ == '__main__': parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') add_config_options_to_parser(parser) cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) - synthesis("Transformer model is so fast!", cfg) \ No newline at end of file + synthesis("Transformer model is so fast!", cfg) diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py index 6870caa..24d80d1 100644 --- a/parakeet/models/transformerTTS/train_postnet.py +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -1,33 +1,23 @@ -from network import * from tensorboardX import SummaryWriter import os from tqdm import tqdm from pathlib import Path +from collections import OrderedDict import jsonargparse from parse import add_config_options_to_parser from pprint import pprint -from parakeet.models.dataloader.jlspeech import LJSpeechLoader - -class MyDataParallel(dg.parallel.DataParallel): - """ - A data parallel proxy for model. - """ - - def __init__(self, layers, strategy): - super(MyDataParallel, self).__init__(layers, strategy) - - def __getattr__(self, key): - if key in self.__dict__: - return object.__getattribute__(self, key) - elif key is "_layers": - return object.__getattribute__(self, "_sub_layers")["_layers"] - else: - return getattr( - object.__getattribute__(self, "_sub_layers")["_layers"], key) +from parakeet.models.dataloader.ljspeech import LJSpeechLoader +from network import * def load_checkpoint(step, model_path): model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) - return model_dict, opti_dict + new_state_dict = OrderedDict() + for param in model_dict: + if param.startswith('_layers.'): + new_state_dict[param[8:]] = model_dict[param] + else: + new_state_dict[param] = model_dict[param] + return new_state_dict, opti_dict def main(cfg): @@ -66,7 +56,7 @@ def main(cfg): if cfg.use_data_parallel: strategy = dg.parallel.prepare_context() - model = MyDataParallel(model, strategy) + model = fluid.dygraph.parallel.DataParallel(model, strategy) reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader() diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py index 81fae56..f8e85b0 100644 --- a/parakeet/models/transformerTTS/train_transformer.py +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -1,46 +1,33 @@ import os from tqdm import tqdm -import paddle.fluid.dygraph as dg -import paddle.fluid.layers as layers -from network import * from tensorboardX import SummaryWriter from pathlib import Path +from collections import OrderedDict import jsonargparse from parse import add_config_options_to_parser from pprint import pprint from matplotlib import cm +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers from parakeet.modules.utils import cross_entropy -from parakeet.models.dataloader.jlspeech import LJSpeechLoader - -class MyDataParallel(dg.parallel.DataParallel): - """ - A data parallel proxy for model. - """ - - def __init__(self, layers, strategy): - super(MyDataParallel, self).__init__(layers, strategy) - - def __getattr__(self, key): - if key in self.__dict__: - return object.__getattribute__(self, key) - elif key is "_layers": - return object.__getattribute__(self, "_sub_layers")["_layers"] - else: - return getattr( - object.__getattribute__(self, "_sub_layers")["_layers"], key) +from parakeet.models.dataloader.ljspeech import LJSpeechLoader +from network import * def load_checkpoint(step, model_path): model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) - return model_dict, opti_dict + new_state_dict = OrderedDict() + for param in model_dict: + if param.startswith('_layers.'): + new_state_dict[param[8:]] = model_dict[param] + else: + new_state_dict[param] = model_dict[param] + return new_state_dict, opti_dict def main(cfg): local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 - fluid.default_startup_program().random_seed = 1 - fluid.default_main_program().random_seed = 1 - if local_rank == 0: # Print the whole config setting. pprint(jsonargparse.namespace_to_dict(cfg)) @@ -74,28 +61,27 @@ def main(cfg): if cfg.use_data_parallel: strategy = dg.parallel.prepare_context() - model = MyDataParallel(model, strategy) + model = fluid.dygraph.parallel.DataParallel(model, strategy) for epoch in range(cfg.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) - character, mel, mel_input, pos_text, pos_mel, text_length = data + character, mel, mel_input, pos_text, pos_mel, text_length, _ = data global_step += 1 mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) label = (pos_mel == 0).astype(np.float32) - #label = np.zeros(stop_preds.shape).astype(np.float32) - #text_length = text_length.numpy() - #for i in range(label.shape[0]): - # label[i][text_length[i] - 1] = 1 mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) - stop_loss = cross_entropy(stop_preds, label) - loss = mel_loss + post_mel_loss + stop_loss + loss = mel_loss + post_mel_loss + # Note: When used stop token loss the learning did not work. + if cfg.stop_token: + stop_loss = cross_entropy(stop_preds, label) + loss = loss + stop_loss if local_rank==0: writer.add_scalars('training_loss', {