diff --git a/examples/FastSpeech/config/fastspeech.yaml b/examples/FastSpeech/config/fastspeech.yaml deleted file mode 100644 index 05df0aa..0000000 --- a/examples/FastSpeech/config/fastspeech.yaml +++ /dev/null @@ -1,46 +0,0 @@ -audio: - num_mels: 80 #the number of mel bands when calculating mel spectrograms. - n_fft: 2048 #the number of fft components. - sr: 22050 #the sampling rate of audio data file. - preemphasis: 0.97 #the preemphasis coefficient. - hop_length: 256 #the number of samples to advance between frames. - win_length: 1024 #the length (width) of the window function. - power: 1.2 #the power to raise before griffin-lim. - min_level_db: -100 #the minimum level db. - ref_level_db: 20 #the reference level db. - outputs_per_step: 1 #the outputs per step. - -encoder_n_layer: 6 -encoder_head: 2 -encoder_conv1d_filter_size: 1536 -max_sep_len: 2048 -decoder_n_layer: 6 -decoder_head: 2 -decoder_conv1d_filter_size: 1536 -fs_hidden_size: 384 -duration_predictor_output_size: 256 -duration_predictor_filter_size: 3 -fft_conv1d_filter: 3 -fft_conv1d_padding: 1 -dropout: 0.1 -transformer_head: 4 - -embedding_size: 512 -hidden_size: 256 - -warm_up_step: 4000 -grad_clip_thresh: 0.1 -batch_size: 32 -epochs: 10000 -lr: 0.001 -save_step: 500 -use_gpu: True -use_data_parallel: True - -data_path: ../../dataset/LJSpeech-1.1 -transtts_path: ../TransformerTTS/checkpoint/ -transformer_step: 160000 -save_path: ./checkpoint -log_dir: ./log -#checkpoint_path: ./checkpoint -#transformer_step: 97000 diff --git a/examples/FastSpeech/parse.py b/examples/FastSpeech/parse.py deleted file mode 100644 index 894d988..0000000 --- a/examples/FastSpeech/parse.py +++ /dev/null @@ -1,97 +0,0 @@ -import jsonargparse - -def add_config_options_to_parser(parser): - parser.add_argument('--audio.num_mels', type=int, default=80, - help="the number of mel bands when calculating mel spectrograms.") - parser.add_argument('--audio.n_fft', type=int, default=2048, - help="the number of fft components.") - parser.add_argument('--audio.sr', type=int, default=22050, - help="the sampling rate of audio data file.") - parser.add_argument('--audio.preemphasis', type=float, default=0.97, - help="the preemphasis coefficient.") - parser.add_argument('--audio.hop_length', type=int, default=128, - help="the number of samples to advance between frames.") - parser.add_argument('--audio.win_length', type=int, default=1024, - help="the length (width) of the window function.") - parser.add_argument('--audio.power', type=float, default=1.4, - help="the power to raise before griffin-lim.") - parser.add_argument('--audio.min_level_db', type=int, default=-100, - help="the minimum level db.") - parser.add_argument('--audio.ref_level_db', type=int, default=20, - help="the reference level db.") - parser.add_argument('--audio.outputs_per_step', type=int, default=1, - help="the outputs per step.") - - parser.add_argument('--encoder_n_layer', type=int, default=6, - help="the number of FFT Block in encoder.") - parser.add_argument('--encoder_head', type=int, default=2, - help="the attention head number in encoder.") - parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024, - help="the filter size of conv1d in encoder.") - parser.add_argument('--max_sep_len', type=int, default=2048, - help="the max length of sequence.") - parser.add_argument('--decoder_n_layer', type=int, default=6, - help="the number of FFT Block in decoder.") - parser.add_argument('--decoder_head', type=int, default=2, - help="the attention head number in decoder.") - 
parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024, - help="the filter size of conv1d in decoder.") - parser.add_argument('--fs_hidden_size', type=int, default=256, - help="the hidden size in model of fastspeech.") - parser.add_argument('--duration_predictor_output_size', type=int, default=256, - help="the output size of duration predictior.") - parser.add_argument('--duration_predictor_filter_size', type=int, default=3, - help="the filter size of conv1d in duration prediction.") - parser.add_argument('--fft_conv1d_filter', type=int, default=3, - help="the filter size of conv1d in fft.") - parser.add_argument('--fft_conv1d_padding', type=int, default=1, - help="the padding size of conv1d in fft.") - parser.add_argument('--dropout', type=float, default=0.1, - help="the dropout in network.") - parser.add_argument('--transformer_head', type=int, default=4, - help="the attention head num of transformerTTS.") - parser.add_argument('--alpha', type=float, default=1.0, - help="the hyperparameter to determine the length of the expanded sequence\ - mel, thereby controlling the voice speed.") - - parser.add_argument('--hidden_size', type=int, default=256, - help="the hidden size in model of transformerTTS.") - parser.add_argument('--embedding_size', type=int, default=256, - help="the dim size of embedding of transformerTTS.") - - parser.add_argument('--warm_up_step', type=int, default=4000, - help="the warm up step of learning rate.") - parser.add_argument('--grad_clip_thresh', type=float, default=1.0, - help="the threshold of grad clip.") - parser.add_argument('--batch_size', type=int, default=32, - help="batch size for training.") - parser.add_argument('--epochs', type=int, default=10000, - help="the number of epoch for training.") - parser.add_argument('--lr', type=float, default=0.001, - help="the learning rate for training.") - parser.add_argument('--save_step', type=int, default=500, - help="checkpointing interval during training.") - parser.add_argument('--fastspeech_step', type=int, default=160000, - help="Global step to restore checkpoint of fastspeech.") - parser.add_argument('--use_gpu', type=bool, default=True, - help="use gpu or not during training.") - parser.add_argument('--use_data_parallel', type=bool, default=False, - help="use data parallel or not during training.") - - parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', - help="the path of dataset.") - parser.add_argument('--checkpoint_path', type=str, default=None, - help="the path to load checkpoint or pretrain model.") - parser.add_argument('--save_path', type=str, default='./checkpoint', - help="the path to save checkpoint.") - parser.add_argument('--log_dir', type=str, default='./log', - help="the directory to save tensorboard log.") - parser.add_argument('--sample_path', type=str, default='./sample', - help="the directory to save audio sample in synthesis.") - parser.add_argument('--transtts_path', type=str, default='./log', - help="the directory to load pretrain transformerTTS model.") - parser.add_argument('--transformer_step', type=int, default=70000, - help="the step to load transformerTTS model.") - - - parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile) diff --git a/examples/TransformerTTS/config/train_transformer.yaml b/examples/TransformerTTS/config/train_transformer.yaml deleted file mode 100644 index 3065f48..0000000 --- a/examples/TransformerTTS/config/train_transformer.yaml +++ /dev/null @@ -1,35 +0,0 @@ -audio: - num_mels: 80 - n_fft: 
2048 - sr: 22050 - preemphasis: 0.97 - hop_length: 275 - win_length: 1102 - power: 1.2 - min_level_db: -100 - ref_level_db: 20 - outputs_per_step: 1 - - -hidden_size: 256 -embedding_size: 512 - - -warm_up_step: 4000 -grad_clip_thresh: 1.0 -batch_size: 32 -epochs: 10000 -lr: 0.001 -save_step: 1000 -image_step: 2000 -use_gpu: True -use_data_parallel: False -stop_token: False - -data_path: ../../dataset/LJSpeech-1.1 -save_path: ./checkpoint -log_dir: ./log -#checkpoint_path: ./checkpoint -#ransformer_step: 97000 - - \ No newline at end of file diff --git a/examples/TransformerTTS/config/train_vocoder.yaml b/examples/TransformerTTS/config/train_vocoder.yaml deleted file mode 100644 index 3c37d4a..0000000 --- a/examples/TransformerTTS/config/train_vocoder.yaml +++ /dev/null @@ -1,29 +0,0 @@ -audio: - num_mels: 80 - n_fft: 2048 - sr: 22050 - preemphasis: 0.97 - hop_length: 275 - win_length: 1102 - power: 1.2 - min_level_db: -100 - ref_level_db: 20 - outputs_per_step: 1 - -hidden_size: 256 -embedding_size: 512 - -warm_up_step: 4000 -grad_clip_thresh: 1.0 -batch_size: 32 -epochs: 10000 -lr: 0.001 -save_step: 10 -use_gpu: True -use_data_parallel: True - -data_path: ../../dataset/LJSpeech-1.1 -save_path: ./checkpoint -log_dir: ./log -#checkpoint_path: ./checkpoint -#transformer_step: 27000 \ No newline at end of file diff --git a/examples/TransformerTTS/parse.py b/examples/TransformerTTS/parse.py deleted file mode 100644 index 5f989de..0000000 --- a/examples/TransformerTTS/parse.py +++ /dev/null @@ -1,69 +0,0 @@ -import jsonargparse - -def add_config_options_to_parser(parser): - parser.add_argument('--audio.num_mels', type=int, default=80, - help="the number of mel bands when calculating mel spectrograms.") - parser.add_argument('--audio.n_fft', type=int, default=2048, - help="the number of fft components.") - parser.add_argument('--audio.sr', type=int, default=22050, - help="the sampling rate of audio data file.") - parser.add_argument('--audio.preemphasis', type=float, default=0.97, - help="the preemphasis coefficient.") - parser.add_argument('--audio.hop_length', type=int, default=128, - help="the number of samples to advance between frames.") - parser.add_argument('--audio.win_length', type=int, default=1024, - help="the length (width) of the window function.") - parser.add_argument('--audio.power', type=float, default=1.4, - help="the power to raise before griffin-lim.") - parser.add_argument('--audio.min_level_db', type=int, default=-100, - help="the minimum level db.") - parser.add_argument('--audio.ref_level_db', type=int, default=20, - help="the reference level db.") - parser.add_argument('--audio.outputs_per_step', type=int, default=1, - help="the outputs per step.") - - parser.add_argument('--hidden_size', type=int, default=256, - help="the hidden size in network.") - parser.add_argument('--embedding_size', type=int, default=512, - help="the embedding vector size.") - - parser.add_argument('--warm_up_step', type=int, default=4000, - help="the warm up step of learning rate.") - parser.add_argument('--grad_clip_thresh', type=float, default=1.0, - help="the threshold of grad clip.") - parser.add_argument('--batch_size', type=int, default=32, - help="batch size for training.") - parser.add_argument('--epochs', type=int, default=10000, - help="the number of epoch for training.") - parser.add_argument('--lr', type=float, default=0.001, - help="the learning rate for training.") - parser.add_argument('--save_step', type=int, default=500, - help="checkpointing interval during training.") - 
parser.add_argument('--image_step', type=int, default=2000, - help="attention image interval during training.") - parser.add_argument('--max_len', type=int, default=400, - help="The max length of audio when synthsis.") - parser.add_argument('--transformer_step', type=int, default=160000, - help="Global step to restore checkpoint of transformer.") - parser.add_argument('--postnet_step', type=int, default=90000, - help="Global step to restore checkpoint of postnet.") - parser.add_argument('--use_gpu', type=bool, default=True, - help="use gpu or not during training.") - parser.add_argument('--use_data_parallel', type=bool, default=False, - help="use data parallel or not during training.") - parser.add_argument('--stop_token', type=bool, default=False, - help="use stop token loss in network or not.") - - parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', - help="the path of dataset.") - parser.add_argument('--checkpoint_path', type=str, default=None, - help="the path to load checkpoint or pretrain model.") - parser.add_argument('--save_path', type=str, default='./checkpoint', - help="the path to save checkpoint.") - parser.add_argument('--log_dir', type=str, default='./log', - help="the directory to save tensorboard log.") - parser.add_argument('--sample_path', type=str, default='./log', - help="the directory to save audio sample in synthesis.") - - - parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile) diff --git a/examples/FastSpeech/README.md b/examples/fastspeech/README.md similarity index 100% rename from examples/FastSpeech/README.md rename to examples/fastspeech/README.md diff --git a/examples/fastspeech/config/fastspeech.yaml b/examples/fastspeech/config/fastspeech.yaml new file mode 100644 index 0000000..5f98157 --- /dev/null +++ b/examples/fastspeech/config/fastspeech.yaml @@ -0,0 +1,32 @@ +audio: + num_mels: 80 #the number of mel bands when calculating mel spectrograms. + n_fft: 2048 #the number of fft components. + sr: 22050 #the sampling rate of audio data file. + preemphasis: 0.97 #the preemphasis coefficient. + hop_length: 256 #the number of samples to advance between frames. + win_length: 1024 #the length (width) of the window function. + power: 1.2 #the power to raise before griffin-lim. + min_level_db: -100 #the minimum level db. + ref_level_db: 20 #the reference level db. + outputs_per_step: 1 #the outputs per step. + +encoder_n_layer: 6 #the number of FFT Block in encoder. +encoder_head: 2 #the attention head number in encoder. +encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder. +max_seq_len: 2048 #the max length of sequence. +decoder_n_layer: 6 #the number of FFT Block in decoder. +decoder_head: 2 #the attention head number in decoder. +decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder. +fs_hidden_size: 384 #the hidden size in model of fastspeech. +duration_predictor_output_size: 256 #the output size of duration predictior. +duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction. +fft_conv1d_filter: 3 #the filter size of conv1d in fft. +fft_conv1d_padding: 1 #the padding size of conv1d in fft. +dropout: 0.1 #the dropout in network. +transformer_head: 4 #the attention head num of transformerTTS. + +embedding_size: 512 #the dim size of embedding of transformerTTS. +hidden_size: 256 #the hidden size in model of transformerTTS. +warm_up_step: 4000 #the warm up step of learning rate. +grad_clip_thresh: 0.1 #the threshold of grad clip. 
+ diff --git a/examples/FastSpeech/config/synthesis.yaml b/examples/fastspeech/config/synthesis.yaml similarity index 77% rename from examples/FastSpeech/config/synthesis.yaml rename to examples/fastspeech/config/synthesis.yaml index 2841b2e..9a43dff 100644 --- a/examples/FastSpeech/config/synthesis.yaml +++ b/examples/fastspeech/config/synthesis.yaml @@ -13,7 +13,7 @@ audio: encoder_n_layer: 6 encoder_head: 2 encoder_conv1d_filter_size: 1536 -max_sep_len: 2048 +max_seq_len: 2048 decoder_n_layer: 6 decoder_head: 2 decoder_conv1d_filter_size: 1536 @@ -23,11 +23,4 @@ duration_predictor_filter_size: 3 fft_conv1d_filter: 3 fft_conv1d_padding: 1 dropout: 0.1 -transformer_head: 4 - -use_gpu: True -alpha: 1.0 - -checkpoint_path: checkpoint/ -fastspeech_step: 71000 -log_dir: ./log \ No newline at end of file +transformer_head: 4 \ No newline at end of file diff --git a/examples/fastspeech/parse.py b/examples/fastspeech/parse.py new file mode 100644 index 0000000..a6c2d99 --- /dev/null +++ b/examples/fastspeech/parse.py @@ -0,0 +1,36 @@ +import argparse + +def add_config_options_to_parser(parser): + parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml', + help="the yaml config file path.") + parser.add_argument('--batch_size', type=int, default=32, + help="batch size for training.") + parser.add_argument('--epochs', type=int, default=10000, + help="the number of epoch for training.") + parser.add_argument('--lr', type=float, default=0.001, + help="the learning rate for training.") + parser.add_argument('--save_step', type=int, default=500, + help="checkpointing interval during training.") + parser.add_argument('--fastspeech_step', type=int, default=70000, + help="Global step to restore checkpoint of fastspeech.") + parser.add_argument('--use_gpu', type=int, default=1, + help="use gpu or not during training.") + parser.add_argument('--use_data_parallel', type=int, default=0, + help="use data parallel or not during training.") + + parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + help="the path of dataset.") + parser.add_argument('--checkpoint_path', type=str, default=None, + help="the path to load checkpoint or pretrain model.") + parser.add_argument('--save_path', type=str, default='./checkpoint', + help="the path to save checkpoint.") + parser.add_argument('--log_dir', type=str, default='./log', + help="the directory to save tensorboard log.") + parser.add_argument('--sample_path', type=str, default='./sample', + help="the directory to save audio sample in synthesis.") + parser.add_argument('--transtts_path', type=str, default='./log', + help="the directory to load pretrain transformerTTS model.") + parser.add_argument('--transformer_step', type=int, default=160000, + help="the step to load transformerTTS model.") + + diff --git a/examples/FastSpeech/synthesis.py b/examples/fastspeech/synthesis.py similarity index 59% rename from examples/FastSpeech/synthesis.py rename to examples/fastspeech/synthesis.py index 779af02..6a3d146 100644 --- a/examples/FastSpeech/synthesis.py +++ b/examples/fastspeech/synthesis.py @@ -1,15 +1,16 @@ import os from tensorboardX import SummaryWriter from collections import OrderedDict -import jsonargparse +import argparse from parse import add_config_options_to_parser from pprint import pprint +from ruamel import yaml import numpy as np import paddle.fluid as fluid import paddle.fluid.dygraph as dg from parakeet.g2p.en import text_to_sequence from parakeet import audio -from network import FastSpeech +from 
parakeet.models.fastspeech.fastspeech import FastSpeech def load_checkpoint(step, model_path): model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) @@ -21,19 +22,22 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict -def synthesis(text_input, cfg): - place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) +def synthesis(text_input, args): + place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) # tensorboard - if not os.path.exists(cfg.log_dir): - os.mkdir(cfg.log_dir) - path = os.path.join(cfg.log_dir,'synthesis') + if not os.path.exists(args.log_dir): + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir,'synthesis') + + with open(args.config_path) as f: + cfg = yaml.load(f, Loader=yaml.Loader) writer = SummaryWriter(path) with dg.guard(place): model = FastSpeech(cfg) - model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))) + model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))) model.eval() text = np.asarray(text_to_sequence(text_input)) @@ -41,18 +45,18 @@ def synthesis(text_input, cfg): pos_text = np.arange(1, text.shape[1]+1) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) - mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha) + mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha) _ljspeech_processor = audio.AudioProcessor( - sample_rate=cfg.audio.sr, - num_mels=cfg.audio.num_mels, - min_level_db=cfg.audio.min_level_db, - ref_level_db=cfg.audio.ref_level_db, - n_fft=cfg.audio.n_fft, - win_length= cfg.audio.win_length, - hop_length= cfg.audio.hop_length, - power=cfg.audio.power, - preemphasis=cfg.audio.preemphasis, + sample_rate=cfg['audio']['sr'], + num_mels=cfg['audio']['num_mels'], + min_level_db=cfg['audio']['min_level_db'], + ref_level_db=cfg['audio']['ref_level_db'], + n_fft=cfg['audio']['n_fft'], + win_length= cfg['audio']['win_length'], + hop_length= cfg['audio']['hop_length'], + power=cfg['audio']['power'], + preemphasis=cfg['audio']['preemphasis'], signal_norm=True, symmetric_norm=False, max_norm=1., @@ -65,12 +69,12 @@ def synthesis(text_input, cfg): mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0]) wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy()) - writer.add_audio(text_input, wav, 0, cfg.audio.sr) + writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) print("Synthesis completed !!!") writer.close() if __name__ == '__main__': - parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') + parser = argparse.ArgumentParser(description="Train Fastspeech model") add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) - synthesis("Transformer model is so fast!", cfg) \ No newline at end of file + args = parser.parse_args() + synthesis("Transformer model is so fast!", args) \ No newline at end of file diff --git a/examples/FastSpeech/train.py b/examples/fastspeech/train.py similarity index 68% rename from examples/FastSpeech/train.py rename to examples/fastspeech/train.py index 4f79705..48c26eb 100644 --- a/examples/FastSpeech/train.py +++ b/examples/fastspeech/train.py @@ -3,10 +3,10 @@ import argparse import os import time import math -import jsonargparse from pathlib import Path from parse import add_config_options_to_parser from pprint import pprint 
+from ruamel import yaml from tqdm import tqdm from collections import OrderedDict from tensorboardX import SummaryWriter @@ -14,7 +14,7 @@ import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers import paddle.fluid as fluid from parakeet.models.dataloader.ljspeech import LJSpeechLoader -from parakeet.models.transformerTTS.transformerTTS import TransformerTTS +from parakeet.models.transformer_tts.transformerTTS import TransformerTTS from parakeet.models.fastspeech.fastspeech import FastSpeech from parakeet.models.fastspeech.utils import get_alignment @@ -28,50 +28,49 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict, opti_dict -def main(cfg): - local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 - nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 +def main(args): + local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 - if local_rank == 0: - # Print the whole config setting. - pprint(jsonargparse.namespace_to_dict(cfg)) + with open(args.config_path) as f: + cfg = yaml.load(f, Loader=yaml.Loader) global_step = 0 place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) - if cfg.use_data_parallel else fluid.CUDAPlace(0) - if cfg.use_gpu else fluid.CPUPlace()) + if args.use_data_parallel else fluid.CUDAPlace(0) + if args.use_gpu else fluid.CPUPlace()) - if not os.path.exists(cfg.log_dir): - os.mkdir(cfg.log_dir) - path = os.path.join(cfg.log_dir,'fastspeech') + if not os.path.exists(args.log_dir): + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir,'fastspeech') writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): with fluid.unique_name.guard(): transformerTTS = TransformerTTS(cfg) - model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer")) + model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer")) transformerTTS.set_dict(model_dict) transformerTTS.eval() model = FastSpeech(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), parameter_list=model.parameters()) - reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader() + reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() - if cfg.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")) + if args.checkpoint_path is not None: + model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) - global_step = cfg.fastspeech_step + global_step = args.fastspeech_step print("load checkpoint!!!") - if cfg.use_data_parallel: + if args.use_data_parallel: strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - for epoch in range(cfg.epochs): + for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): @@ -79,7 +78,7 @@ def main(cfg): character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) - 
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32) + alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32) global_step += 1 @@ -101,20 +100,20 @@ def main(cfg): writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) - if cfg.use_data_parallel: + if args.use_data_parallel: total_loss = model.scale_loss(total_loss) total_loss.backward() model.apply_collective_grads() else: total_loss.backward() - optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) model.clear_gradients() # save checkpoint - if local_rank==0 and global_step % cfg.save_step == 0: - if not os.path.exists(cfg.save_path): - os.mkdir(cfg.save_path) - save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step) + if local_rank==0 and global_step % args.save_step == 0: + if not os.path.exists(args.save_path): + os.mkdir(args.save_path) + save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) if local_rank==0: @@ -122,7 +121,9 @@ def main(cfg): if __name__ =='__main__': - parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse') + parser = argparse.ArgumentParser(description="Train Fastspeech model") add_config_options_to_parser(parser) - cfg = parser.parse_args('-c config/fastspeech.yaml'.split()) - main(cfg) + args = parser.parse_args() + # Print the whole config setting. + pprint(args) + main(args) diff --git a/examples/TransformerTTS/README.md b/examples/transformer_tts/README.md similarity index 100% rename from examples/TransformerTTS/README.md rename to examples/transformer_tts/README.md diff --git a/examples/transformer_tts/config/synthesis.yaml b/examples/transformer_tts/config/synthesis.yaml new file mode 100644 index 0000000..217dd85 --- /dev/null +++ b/examples/transformer_tts/config/synthesis.yaml @@ -0,0 +1,11 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + ref_level_db: 20 + outputs_per_step: 1 \ No newline at end of file diff --git a/examples/TransformerTTS/config/synthesis.yaml b/examples/transformer_tts/config/train_transformer.yaml similarity index 58% rename from examples/TransformerTTS/config/synthesis.yaml rename to examples/transformer_tts/config/train_transformer.yaml index 413e816..21abb01 100644 --- a/examples/TransformerTTS/config/synthesis.yaml +++ b/examples/transformer_tts/config/train_transformer.yaml @@ -10,11 +10,11 @@ audio: ref_level_db: 20 outputs_per_step: 1 -max_len: 50 -transformer_step: 10 -postnet_step: 10 -use_gpu: True -checkpoint_path: ./checkpoint -log_dir: ./log -sample_path: ./sample \ No newline at end of file +hidden_size: 256 +embedding_size: 512 +warm_up_step: 4000 +grad_clip_thresh: 1.0 + + + \ No newline at end of file diff --git a/examples/transformer_tts/config/train_vocoder.yaml b/examples/transformer_tts/config/train_vocoder.yaml new file mode 100644 index 0000000..6ef3152 --- /dev/null +++ b/examples/transformer_tts/config/train_vocoder.yaml @@ -0,0 +1,16 @@ +audio: + num_mels: 80 + n_fft: 2048 + sr: 22050 + preemphasis: 0.97 + hop_length: 275 + win_length: 1102 + power: 1.2 + min_level_db: -100 + 
ref_level_db: 20 + outputs_per_step: 1 + +hidden_size: 256 +embedding_size: 512 +warm_up_step: 4000 +grad_clip_thresh: 1.0 \ No newline at end of file diff --git a/examples/transformer_tts/parse.py b/examples/transformer_tts/parse.py new file mode 100644 index 0000000..aebce96 --- /dev/null +++ b/examples/transformer_tts/parse.py @@ -0,0 +1,38 @@ +import argparse + +def add_config_options_to_parser(parser): + parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml', + help="the yaml config file path.") + parser.add_argument('--batch_size', type=int, default=32, + help="batch size for training.") + parser.add_argument('--epochs', type=int, default=10000, + help="the number of epoch for training.") + parser.add_argument('--lr', type=float, default=0.001, + help="the learning rate for training.") + parser.add_argument('--save_step', type=int, default=500, + help="checkpointing interval during training.") + parser.add_argument('--image_step', type=int, default=2000, + help="attention image interval during training.") + parser.add_argument('--max_len', type=int, default=400, + help="The max length of audio when synthsis.") + parser.add_argument('--transformer_step', type=int, default=160000, + help="Global step to restore checkpoint of transformer.") + parser.add_argument('--vocoder_step', type=int, default=90000, + help="Global step to restore checkpoint of postnet.") + parser.add_argument('--use_gpu', type=int, default=1, + help="use gpu or not during training.") + parser.add_argument('--use_data_parallel', type=int, default=0, + help="use data parallel or not during training.") + parser.add_argument('--stop_token', type=int, default=0, + help="use stop token loss in network or not.") + + parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + help="the path of dataset.") + parser.add_argument('--checkpoint_path', type=str, default=None, + help="the path to load checkpoint or pretrain model.") + parser.add_argument('--save_path', type=str, default='./checkpoint', + help="the path to save checkpoint.") + parser.add_argument('--log_dir', type=str, default='./log', + help="the directory to save tensorboard log.") + parser.add_argument('--sample_path', type=str, default='./sample', + help="the directory to save audio sample in synthesis.") diff --git a/examples/TransformerTTS/synthesis.py b/examples/transformer_tts/synthesis.py similarity index 58% rename from examples/TransformerTTS/synthesis.py rename to examples/transformer_tts/synthesis.py index 5420040..d0c155c 100644 --- a/examples/TransformerTTS/synthesis.py +++ b/examples/transformer_tts/synthesis.py @@ -2,17 +2,19 @@ import os from scipy.io.wavfile import write from parakeet.g2p.en import text_to_sequence import numpy as np -from network import TransformerTTS, ModelPostNet from tqdm import tqdm from tensorboardX import SummaryWriter +from ruamel import yaml import paddle.fluid as fluid import paddle.fluid.dygraph as dg from pathlib import Path -import jsonargparse +import argparse from parse import add_config_options_to_parser from pprint import pprint from collections import OrderedDict from parakeet import audio +from parakeet.models.transformer_tts.vocoder import Vocoder +from parakeet.models.transformer_tts.transformerTTS import TransformerTTS def load_checkpoint(step, model_path): model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) @@ -24,25 +26,28 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict 
-def synthesis(text_input, cfg): - place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) +def synthesis(text_input, args): + place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) + + with open(args.config_path) as f: + cfg = yaml.load(f, Loader=yaml.Loader) # tensorboard - if not os.path.exists(cfg.log_dir): - os.mkdir(cfg.log_dir) - path = os.path.join(cfg.log_dir,'synthesis') + if not os.path.exists(args.log_dir): + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir,'synthesis') writer = SummaryWriter(path) with dg.guard(place): with fluid.unique_name.guard(): model = TransformerTTS(cfg) - model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer"))) + model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "nostop_token/transformer"))) model.eval() with fluid.unique_name.guard(): - model_postnet = ModelPostNet(cfg) - model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) + model_postnet = Vocoder(cfg, args.batch_size) + model_postnet.set_dict(load_checkpoint(str(args.postnet_step), os.path.join(args.checkpoint_path, "postnet"))) model_postnet.eval() # init input text = np.asarray(text_to_sequence(text_input)) @@ -52,7 +57,7 @@ def synthesis(text_input, cfg): pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) - pbar = tqdm(range(cfg.max_len)) + pbar = tqdm(range(args.max_len)) for i in pbar: pos_mel = np.arange(1, mel_input.shape[1]+1) @@ -62,15 +67,15 @@ def synthesis(text_input, cfg): mag_pred = model_postnet(postnet_pred) _ljspeech_processor = audio.AudioProcessor( - sample_rate=cfg.audio.sr, - num_mels=cfg.audio.num_mels, - min_level_db=cfg.audio.min_level_db, - ref_level_db=cfg.audio.ref_level_db, - n_fft=cfg.audio.n_fft, - win_length= cfg.audio.win_length, - hop_length= cfg.audio.hop_length, - power=cfg.audio.power, - preemphasis=cfg.audio.preemphasis, + sample_rate=cfg['audio']['sr'], + num_mels=cfg['audio']['num_mels'], + min_level_db=cfg['audio']['min_level_db'], + ref_level_db=cfg['audio']['ref_level_db'], + n_fft=cfg['audio']['n_fft'], + win_length= cfg['audio']['win_length'], + hop_length= cfg['audio']['hop_length'], + power=cfg['audio']['power'], + preemphasis=cfg['audio']['preemphasis'], signal_norm=True, symmetric_norm=False, max_norm=1., @@ -82,14 +87,14 @@ def synthesis(text_input, cfg): sound_norm=False) wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) - writer.add_audio(text_input, wav, 0, cfg.audio.sr) - if not os.path.exists(cfg.sample_path): - os.mkdir(cfg.sample_path) - write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav) + writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) + if not os.path.exists(args.sample_path): + os.mkdir(args.sample_path) + write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav) writer.close() if __name__ == '__main__': - parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') + parser = argparse.ArgumentParser(description="Synthesis model") add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) - synthesis("Transformer model is so fast!", cfg) + args = parser.parse_args() + synthesis("Transformer model is so fast!", args) diff --git a/examples/TransformerTTS/train_transformer.py b/examples/transformer_tts/train_transformer.py similarity index 72% 
rename from examples/TransformerTTS/train_transformer.py rename to examples/transformer_tts/train_transformer.py index 4d046cb..bcfa16f 100644 --- a/examples/TransformerTTS/train_transformer.py +++ b/examples/transformer_tts/train_transformer.py @@ -3,9 +3,10 @@ from tqdm import tqdm from tensorboardX import SummaryWriter from pathlib import Path from collections import OrderedDict -import jsonargparse +import argparse from parse import add_config_options_to_parser from pprint import pprint +from ruamel import yaml from matplotlib import cm import numpy as np import paddle.fluid as fluid @@ -13,7 +14,7 @@ import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers from parakeet.modules.utils import cross_entropy from parakeet.models.dataloader.ljspeech import LJSpeechLoader -from parakeet.models.transformerTTS.transformerTTS import TransformerTTS +from parakeet.models.transformer_tts.transformerTTS import TransformerTTS def load_checkpoint(step, model_path): model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) @@ -26,22 +27,21 @@ def load_checkpoint(step, model_path): return new_state_dict, opti_dict -def main(cfg): - local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 - nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 +def main(args): + local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 - if local_rank == 0: - # Print the whole config setting. - pprint(jsonargparse.namespace_to_dict(cfg)) + with open(args.config_path) as f: + cfg = yaml.load(f, Loader=yaml.Loader) global_step = 0 place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) - if cfg.use_data_parallel else fluid.CUDAPlace(0) - if cfg.use_gpu else fluid.CPUPlace()) + if args.use_data_parallel else fluid.CUDAPlace(0) + if args.use_gpu else fluid.CPUPlace()) - if not os.path.exists(cfg.log_dir): - os.mkdir(cfg.log_dir) - path = os.path.join(cfg.log_dir,'transformer') + if not os.path.exists(args.log_dir): + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir,'transformer') writer = SummaryWriter(path) if local_rank == 0 else None @@ -49,23 +49,23 @@ def main(cfg): model = TransformerTTS(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), parameter_list=model.parameters()) - reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader() - - if cfg.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")) + reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() + + if args.checkpoint_path is not None: + model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) - global_step = cfg.transformer_step + global_step = args.transformer_step print("load checkpoint!!!") - if cfg.use_data_parallel: + if args.use_data_parallel: strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - for epoch in range(cfg.epochs): + for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d'%epoch) @@ -81,7 +81,7 
@@ def main(cfg): post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss # Note: When used stop token loss the learning did not work. - if cfg.stop_token: + if args.stop_token: stop_loss = cross_entropy(stop_preds, label) loss = loss + stop_loss @@ -91,7 +91,7 @@ def main(cfg): 'post_mel_loss':post_mel_loss.numpy() }, global_step) - if cfg.stop_token: + if args.stop_token: writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) writer.add_scalars('alphas', { @@ -101,7 +101,7 @@ def main(cfg): writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) - if global_step % cfg.image_step == 1: + if global_step % args.image_step == 1: for i, prob in enumerate(attn_probs): for j in range(4): x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) @@ -117,20 +117,20 @@ def main(cfg): x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") - if cfg.use_data_parallel: + if args.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) model.clear_gradients() # save checkpoint - if local_rank==0 and global_step % cfg.save_step == 0: - if not os.path.exists(cfg.save_path): - os.mkdir(cfg.save_path) - save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step) + if local_rank==0 and global_step % args.save_step == 0: + if not os.path.exists(args.save_path): + os.mkdir(args.save_path) + save_path = os.path.join(args.save_path,'transformer/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) if local_rank==0: @@ -138,7 +138,10 @@ def main(cfg): if __name__ =='__main__': - parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse') + parser = argparse.ArgumentParser(description="Train TransformerTTS model") add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split()) - main(cfg) \ No newline at end of file + + args = parser.parse_args() + # Print the whole config setting. 
+ pprint(args) + main(args) \ No newline at end of file diff --git a/examples/TransformerTTS/train_vocoder.py b/examples/transformer_tts/train_vocoder.py similarity index 61% rename from examples/TransformerTTS/train_vocoder.py rename to examples/transformer_tts/train_vocoder.py index b73f28c..b2db5fc 100644 --- a/examples/TransformerTTS/train_vocoder.py +++ b/examples/transformer_tts/train_vocoder.py @@ -3,14 +3,15 @@ import os from tqdm import tqdm from pathlib import Path from collections import OrderedDict -import jsonargparse +import argparse +from ruamel import yaml from parse import add_config_options_to_parser from pprint import pprint import paddle.fluid as fluid import paddle.fluid.dygraph as dg import paddle.fluid.layers as layers from parakeet.models.dataloader.ljspeech import LJSpeechLoader -from parakeet.models.transformerTTS.vocoder import Vocoder +from parakeet.models.transformer_tts.vocoder import Vocoder def load_checkpoint(step, model_path): model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step)) @@ -22,48 +23,47 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict, opti_dict -def main(cfg): +def main(args): - local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 - nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 - - if local_rank == 0: - # Print the whole config setting. - pprint(jsonargparse.namespace_to_dict(cfg)) + local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 + + with open(args.config_path) as f: + cfg = yaml.load(f, Loader=yaml.Loader) global_step = 0 place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) - if cfg.use_data_parallel else fluid.CUDAPlace(0) - if cfg.use_gpu else fluid.CPUPlace()) + if args.use_data_parallel else fluid.CUDAPlace(0) + if args.use_gpu else fluid.CPUPlace()) - if not os.path.exists(cfg.log_dir): - os.mkdir(cfg.log_dir) - path = os.path.join(cfg.log_dir,'postnet') + if not os.path.exists(args.log_dir): + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir,'postnet') writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): - model = Vocoder(cfg) + model = Vocoder(cfg, args.batch_size) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), parameter_list=model.parameters()) - if cfg.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")) + if args.checkpoint_path is not None: + model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "postnet")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) - global_step = cfg.postnet_step + global_step = args.vocoder_step print("load checkpoint!!!") - if cfg.use_data_parallel: + if args.use_data_parallel: strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader() + reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader() - for epoch in range(cfg.epochs): + for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch 
%d'%epoch) @@ -75,13 +75,13 @@ def main(cfg): mag_pred = model(mel) loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) - if cfg.use_data_parallel: + if args.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) model.clear_gradients() if local_rank==0: @@ -89,10 +89,10 @@ def main(cfg): 'loss':loss.numpy(), }, global_step) - if global_step % cfg.save_step == 0: - if not os.path.exists(cfg.save_path): - os.mkdir(cfg.save_path) - save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step) + if global_step % args.save_step == 0: + if not os.path.exists(args.save_path): + os.mkdir(args.save_path) + save_path = os.path.join(args.save_path,'postnet/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) @@ -100,7 +100,9 @@ def main(cfg): writer.close() if __name__ == '__main__': - parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') + parser = argparse.ArgumentParser(description="Train postnet model") add_config_options_to_parser(parser) - cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split()) - main(cfg) \ No newline at end of file + args = parser.parse_args() + # Print the whole config setting. + pprint(args) + main(args) \ No newline at end of file diff --git a/parakeet/models/dataloader/ljspeech.py b/parakeet/models/dataloader/ljspeech.py index 21f8fc9..9401b7b 100644 --- a/parakeet/models/dataloader/ljspeech.py +++ b/parakeet/models/dataloader/ljspeech.py @@ -13,17 +13,17 @@ from parakeet.data.batch import TextIDBatcher, SpecBatcher from parakeet.data.dataset import DatasetMixin, TransformDataset class LJSpeechLoader: - def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True): - place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True): + place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace() - LJSPEECH_ROOT = Path(config.data_path) + LJSPEECH_ROOT = Path(args.data_path) metadata = LJSpeechMetaData(LJSPEECH_ROOT) transformer = LJSpeech(config) dataset = TransformDataset(metadata, transformer) sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle) - assert config.batch_size % nranks == 0 - each_bs = config.batch_size // nranks + assert args.batch_size % nranks == 0 + each_bs = args.batch_size // nranks if is_vocoder: dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True) else: @@ -63,15 +63,15 @@ class LJSpeech(object): super(LJSpeech, self).__init__() self.config = config self._ljspeech_processor = audio.AudioProcessor( - sample_rate=config.audio.sr, - num_mels=config.audio.num_mels, - min_level_db=config.audio.min_level_db, - ref_level_db=config.audio.ref_level_db, - n_fft=config.audio.n_fft, - win_length= config.audio.win_length, - hop_length= config.audio.hop_length, - power=config.audio.power, - preemphasis=config.audio.preemphasis, + sample_rate=config['audio']['sr'], + num_mels=config['audio']['num_mels'], + min_level_db=config['audio']['min_level_db'], + ref_level_db=config['audio']['ref_level_db'], + n_fft=config['audio']['n_fft'], + 
win_length= config['audio']['win_length'], + hop_length= config['audio']['hop_length'], + power=config['audio']['power'], + preemphasis=config['audio']['preemphasis'], signal_norm=True, symmetric_norm=False, max_norm=1., diff --git a/parakeet/models/fastspeech/fastspeech.py b/parakeet/models/fastspeech/fastspeech.py index f533504..ee120b9 100644 --- a/parakeet/models/fastspeech/fastspeech.py +++ b/parakeet/models/fastspeech/fastspeech.py @@ -2,7 +2,7 @@ import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.g2p.text.symbols import symbols -from parakeet.models.transformerTTS.post_convnet import PostConvNet +from parakeet.models.transformer_tts.post_convnet import PostConvNet from parakeet.models.fastspeech.LengthRegulator import LengthRegulator from parakeet.models.fastspeech.encoder import Encoder from parakeet.models.fastspeech.decoder import Decoder @@ -13,43 +13,43 @@ class FastSpeech(dg.Layer): super(FastSpeech, self).__init__() self.encoder = Encoder(n_src_vocab=len(symbols)+1, - len_max_seq=cfg.max_sep_len, - n_layers=cfg.encoder_n_layer, - n_head=cfg.encoder_head, - d_k=cfg.fs_hidden_size // cfg.encoder_head, - d_v=cfg.fs_hidden_size // cfg.encoder_head, - d_model=cfg.fs_hidden_size, - d_inner=cfg.encoder_conv1d_filter_size, - fft_conv1d_kernel=cfg.fft_conv1d_filter, - fft_conv1d_padding=cfg.fft_conv1d_padding, + len_max_seq=cfg['max_seq_len'], + n_layers=cfg['encoder_n_layer'], + n_head=cfg['encoder_head'], + d_k=cfg['fs_hidden_size'] // cfg['encoder_head'], + d_v=cfg['fs_hidden_size'] // cfg['encoder_head'], + d_model=cfg['fs_hidden_size'], + d_inner=cfg['encoder_conv1d_filter_size'], + fft_conv1d_kernel=cfg['fft_conv1d_filter'], + fft_conv1d_padding=cfg['fft_conv1d_padding'], dropout=0.1) - self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size, - out_channels=cfg.duration_predictor_output_size, - filter_size=cfg.duration_predictor_filter_size, - dropout=cfg.dropout) - self.decoder = Decoder(len_max_seq=cfg.max_sep_len, - n_layers=cfg.decoder_n_layer, - n_head=cfg.decoder_head, - d_k=cfg.fs_hidden_size // cfg.decoder_head, - d_v=cfg.fs_hidden_size // cfg.decoder_head, - d_model=cfg.fs_hidden_size, - d_inner=cfg.decoder_conv1d_filter_size, - fft_conv1d_kernel=cfg.fft_conv1d_filter, - fft_conv1d_padding=cfg.fft_conv1d_padding, + self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'], + out_channels=cfg['duration_predictor_output_size'], + filter_size=cfg['duration_predictor_filter_size'], + dropout=cfg['dropout']) + self.decoder = Decoder(len_max_seq=cfg['max_seq_len'], + n_layers=cfg['decoder_n_layer'], + n_head=cfg['decoder_head'], + d_k=cfg['fs_hidden_size'] // cfg['decoder_head'], + d_v=cfg['fs_hidden_size'] // cfg['decoder_head'], + d_model=cfg['fs_hidden_size'], + d_inner=cfg['decoder_conv1d_filter_size'], + fft_conv1d_kernel=cfg['fft_conv1d_filter'], + fft_conv1d_padding=cfg['fft_conv1d_padding'], dropout=0.1) self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) - k = math.sqrt(1 / cfg.fs_hidden_size) + k = math.sqrt(1 / cfg['fs_hidden_size']) self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) - self.mel_linear = dg.Linear(cfg.fs_hidden_size, - cfg.audio.num_mels * cfg.audio.outputs_per_step, + self.mel_linear = dg.Linear(cfg['fs_hidden_size'], + cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'], param_attr = self.weight, bias_attr = self.bias,) - self.postnet = PostConvNet(n_mels=cfg.audio.num_mels, + self.postnet = 
PostConvNet(n_mels=cfg['audio']['num_mels'], num_hidden=512, filter_size=5, padding=int(5 / 2), num_conv=5, - outputs_per_step=cfg.audio.outputs_per_step, + outputs_per_step=cfg['audio']['outputs_per_step'], use_cudnn=True, dropout=0.1, batchnorm_last=True) diff --git a/parakeet/models/transformerTTS/CBHG.py b/parakeet/models/transformer_tts/CBHG.py similarity index 100% rename from parakeet/models/transformerTTS/CBHG.py rename to parakeet/models/transformer_tts/CBHG.py diff --git a/parakeet/models/transformerTTS/__init__.py b/parakeet/models/transformer_tts/__init__.py similarity index 100% rename from parakeet/models/transformerTTS/__init__.py rename to parakeet/models/transformer_tts/__init__.py diff --git a/parakeet/models/transformerTTS/decoder.py b/parakeet/models/transformer_tts/decoder.py similarity index 89% rename from parakeet/models/transformerTTS/decoder.py rename to parakeet/models/transformer_tts/decoder.py index 66b8f06..b0da788 100644 --- a/parakeet/models/transformerTTS/decoder.py +++ b/parakeet/models/transformer_tts/decoder.py @@ -4,8 +4,8 @@ import paddle.fluid as fluid from parakeet.modules.utils import * from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward -from parakeet.models.transformerTTS.prenet import PreNet -from parakeet.models.transformerTTS.post_convnet import PostConvNet +from parakeet.models.transformer_tts.prenet import PreNet +from parakeet.models.transformer_tts.post_convnet import PostConvNet class Decoder(dg.Layer): def __init__(self, num_hidden, config, num_head=4): @@ -20,7 +20,7 @@ class Decoder(dg.Layer): param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), trainable=False)) - self.decoder_prenet = PreNet(input_size = config.audio.num_mels, + self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'], hidden_size = num_hidden * 2, output_size = num_hidden, dropout_rate=0.2) @@ -38,17 +38,17 @@ class Decoder(dg.Layer): self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step, + self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'], param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) self.stop_linear = dg.Linear(num_hidden, 1, param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, + self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'], filter_size = 5, padding = 4, num_conv=5, - outputs_per_step=config.audio.outputs_per_step, - use_cudnn = config.use_gpu) + outputs_per_step=config['audio']['outputs_per_step'], + use_cudnn = True) def forward(self, key, value, query, c_mask, positional): diff --git a/parakeet/models/transformerTTS/encoder.py b/parakeet/models/transformer_tts/encoder.py similarity index 87% rename from parakeet/models/transformerTTS/encoder.py rename to parakeet/models/transformer_tts/encoder.py index b9ae6d5..1ce5fb9 100644 --- a/parakeet/models/transformerTTS/encoder.py +++ 
b/parakeet/models/transformer_tts/encoder.py @@ -3,10 +3,10 @@ import paddle.fluid as fluid from parakeet.modules.utils import * from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward -from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet +from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet class Encoder(dg.Layer): - def __init__(self, embedding_size, num_hidden, config, num_head=4): + def __init__(self, embedding_size, num_hidden, num_head=4): super(Encoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) @@ -19,11 +19,11 @@ class Encoder(dg.Layer): trainable=False)) self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, num_hidden = num_hidden, - use_cudnn=config.use_gpu) + use_cudnn=True) self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) diff --git a/parakeet/models/transformerTTS/encoderprenet.py b/parakeet/models/transformer_tts/encoderprenet.py similarity index 100% rename from parakeet/models/transformerTTS/encoderprenet.py rename to parakeet/models/transformer_tts/encoderprenet.py diff --git a/parakeet/models/transformerTTS/post_convnet.py b/parakeet/models/transformer_tts/post_convnet.py similarity index 100% rename from parakeet/models/transformerTTS/post_convnet.py rename to parakeet/models/transformer_tts/post_convnet.py diff --git a/parakeet/models/transformerTTS/prenet.py b/parakeet/models/transformer_tts/prenet.py similarity index 100% rename from parakeet/models/transformerTTS/prenet.py rename to parakeet/models/transformer_tts/prenet.py diff --git a/parakeet/models/transformerTTS/transformerTTS.py b/parakeet/models/transformer_tts/transformerTTS.py similarity index 78% rename from parakeet/models/transformerTTS/transformerTTS.py rename to parakeet/models/transformer_tts/transformerTTS.py index 2a731ab..b275326 100644 --- a/parakeet/models/transformerTTS/transformerTTS.py +++ b/parakeet/models/transformer_tts/transformerTTS.py @@ -1,13 +1,13 @@ import paddle.fluid.dygraph as dg import paddle.fluid as fluid -from parakeet.models.transformerTTS.encoder import Encoder -from parakeet.models.transformerTTS.decoder import Decoder +from parakeet.models.transformer_tts.encoder import Encoder +from parakeet.models.transformer_tts.decoder import Decoder class TransformerTTS(dg.Layer): def __init__(self, config): super(TransformerTTS, self).__init__() - self.encoder = Encoder(config.embedding_size, config.hidden_size, config) - self.decoder = Decoder(config.hidden_size, config) + self.encoder = Encoder(config['embedding_size'], config['hidden_size']) + self.decoder = Decoder(config['hidden_size'], config) self.config = config def forward(self, characters, mel_input, pos_text, pos_mel): diff --git a/parakeet/models/transformerTTS/vocoder.py b/parakeet/models/transformer_tts/vocoder.py similarity index 57% rename from parakeet/models/transformerTTS/vocoder.py rename to parakeet/models/transformer_tts/vocoder.py 
index f8e66e1..690d4ce 100644 --- a/parakeet/models/transformerTTS/vocoder.py +++ b/parakeet/models/transformer_tts/vocoder.py @@ -2,20 +2,20 @@ import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.modules.customized import Conv1D from parakeet.modules.utils import * -from parakeet.models.transformerTTS.CBHG import CBHG +from parakeet.models.transformer_tts.CBHG import CBHG class Vocoder(dg.Layer): """ CBHG Network (mel -> linear) """ - def __init__(self, config): + def __init__(self, config, batch_size): super(Vocoder, self).__init__() - self.pre_proj = Conv1D(num_channels = config.audio.num_mels, - num_filters = config.hidden_size, + self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'], + num_filters = config['hidden_size'], filter_size=1) - self.cbhg = CBHG(config.hidden_size, config.batch_size) - self.post_proj = Conv1D(num_channels = config.hidden_size, - num_filters = (config.audio.n_fft // 2) + 1, + self.cbhg = CBHG(config['hidden_size'], batch_size) + self.post_proj = Conv1D(num_channels = config['hidden_size'], + num_filters = (config['audio']['n_fft'] // 2) + 1, filter_size=1) def forward(self, mel):
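
For reference, below is a minimal sketch of the configuration pattern the refactored scripts share: runtime options come from argparse (via add_config_options_to_parser in each example's parse.py), while model hyperparameters are read from the YAML file named by --config_path with ruamel.yaml and accessed as a plain nested dict (cfg['audio']['sr'] rather than cfg.audio.sr). This is illustrative only and not part of the patch; it assumes it is run from examples/fastspeech so that parse.py is importable and that ruamel.yaml is installed.

import argparse
from ruamel import yaml

from parse import add_config_options_to_parser  # parse.py added by this patch

parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
args = parser.parse_args()  # e.g. --config_path config/fastspeech.yaml --batch_size 32

# Hyperparameters live in the YAML config and are loaded into a nested dict.
with open(args.config_path) as f:
    cfg = yaml.load(f, Loader=yaml.Loader)

print(args.batch_size)       # runtime option, taken from the command line
print(cfg['audio']['sr'])    # audio/model hyperparameter, taken from the YAML file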