transform parse to argparse

This commit is contained in:
lifuchen 2020-02-13 06:48:21 +00:00 committed by chenfeiyu
parent f5ac04b1a3
commit 04d7f8b598
30 changed files with 365 additions and 500 deletions
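The change swaps jsonargparse (YAML files consumed through an ActionConfigFile argument into a dotted namespace, e.g. cfg.audio.num_mels) for plain argparse flags plus a YAML file that each script loads itself with ruamel.yaml into a nested dict (cfg['audio']['num_mels']). A minimal sketch of the resulting pattern, using illustrative names rather than code from this commit:

import argparse
from ruamel import yaml

parser = argparse.ArgumentParser(description="Train model")
parser.add_argument('--config_path', type=str, default='config/model.yaml',
                    help="the yaml config file path.")
parser.add_argument('--use_gpu', type=int, default=1,
                    help="use gpu or not during training.")
args = parser.parse_args()

# Hyperparameters stay in YAML; run-time options live on the command line.
with open(args.config_path) as f:
    cfg = yaml.load(f, Loader=yaml.Loader)   # nested dict, e.g. cfg['audio']['sr']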

View File

@ -1,46 +0,0 @@
audio:
num_mels: 80 #the number of mel bands when calculating mel spectrograms.
n_fft: 2048 #the number of fft components.
sr: 22050 #the sampling rate of audio data file.
preemphasis: 0.97 #the preemphasis coefficient.
hop_length: 256 #the number of samples to advance between frames.
win_length: 1024 #the length (width) of the window function.
power: 1.2 #the power to raise before griffin-lim.
min_level_db: -100 #the minimum level db.
ref_level_db: 20 #the reference level db.
outputs_per_step: 1 #the outputs per step.
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
fs_hidden_size: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
dropout: 0.1
transformer_head: 4
embedding_size: 512
hidden_size: 256
warm_up_step: 4000
grad_clip_thresh: 0.1
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: True
data_path: ../../dataset/LJSpeech-1.1
transtts_path: ../TransformerTTS/checkpoint/
transformer_step: 160000
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#transformer_step: 97000

View File

@ -1,97 +0,0 @@
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of audio data file.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--encoder_n_layer', type=int, default=6,
help="the number of FFT Block in encoder.")
parser.add_argument('--encoder_head', type=int, default=2,
help="the attention head number in encoder.")
parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in encoder.")
parser.add_argument('--max_sep_len', type=int, default=2048,
help="the max length of sequence.")
parser.add_argument('--decoder_n_layer', type=int, default=6,
help="the number of FFT Block in decoder.")
parser.add_argument('--decoder_head', type=int, default=2,
help="the attention head number in decoder.")
parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in decoder.")
parser.add_argument('--fs_hidden_size', type=int, default=256,
help="the hidden size in model of fastspeech.")
parser.add_argument('--duration_predictor_output_size', type=int, default=256,
help="the output size of duration predictior.")
parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
help="the filter size of conv1d in duration prediction.")
parser.add_argument('--fft_conv1d_filter', type=int, default=3,
help="the filter size of conv1d in fft.")
parser.add_argument('--fft_conv1d_padding', type=int, default=1,
help="the padding size of conv1d in fft.")
parser.add_argument('--dropout', type=float, default=0.1,
help="the dropout in network.")
parser.add_argument('--transformer_head', type=int, default=4,
help="the attention head num of transformerTTS.")
parser.add_argument('--alpha', type=float, default=1.0,
help="the hyperparameter to determine the length of the expanded sequence\
mel, thereby controlling the voice speed.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model of transformerTTS.")
parser.add_argument('--embedding_size', type=int, default=256,
help="the dim size of embedding of transformerTTS.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
help="the threshold of grad clip.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--fastspeech_step', type=int, default=160000,
help="Global step to restore checkpoint of fastspeech.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log',
help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=70000,
help="the step to load transformerTTS model.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
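Before this commit, the dotted option names above combined with the -c/--config ActionConfigFile into one nested namespace; a sketch of how the old training entry points consumed it (assuming the add_config_options_to_parser defined above is importable as parse):

import jsonargparse
from parse import add_config_options_to_parser

parser = jsonargparse.ArgumentParser(description="Train Fastspeech model",
                                     formatter_class='default_argparse')
add_config_options_to_parser(parser)
# the old scripts parsed a hard-coded config file instead of real CLI input
cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
print(cfg.audio.num_mels)   # dotted argument names become nested attributes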

View File

@ -1,35 +0,0 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 1000
image_step: 2000
use_gpu: True
use_data_parallel: False
stop_token: False
data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#transformer_step: 97000

View File

@ -1,29 +0,0 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 10
use_gpu: True
use_data_parallel: True
data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#transformer_step: 27000

View File

@ -1,69 +0,0 @@
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of audio data file.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in network.")
parser.add_argument('--embedding_size', type=int, default=512,
help="the embedding vector size.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
help="the threshold of grad clip.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
help="Global step to restore checkpoint of transformer.")
parser.add_argument('--postnet_step', type=int, default=90000,
help="Global step to restore checkpoint of postnet.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--stop_token', type=bool, default=False,
help="use stop token loss in network or not.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./log',
help="the directory to save audio sample in synthesis.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -0,0 +1,32 @@
audio:
num_mels: 80 #the number of mel bands when calculating mel spectrograms.
n_fft: 2048 #the number of fft components.
sr: 22050 #the sampling rate of audio data file.
preemphasis: 0.97 #the preemphasis coefficient.
hop_length: 256 #the number of samples to advance between frames.
win_length: 1024 #the length (width) of the window function.
power: 1.2 #the power to raise before griffin-lim.
min_level_db: -100 #the minimum level db.
ref_level_db: 20 #the reference level db.
outputs_per_step: 1 #the outputs per step.
encoder_n_layer: 6 #the number of FFT Block in encoder.
encoder_head: 2 #the attention head number in encoder.
encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder.
max_seq_len: 2048 #the max length of sequence.
decoder_n_layer: 6 #the number of FFT Block in decoder.
decoder_head: 2 #the attention head number in decoder.
decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder.
fs_hidden_size: 384 #the hidden size in model of fastspeech.
duration_predictor_output_size: 256 #the output size of duration predictor.
duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction.
fft_conv1d_filter: 3 #the filter size of conv1d in fft.
fft_conv1d_padding: 1 #the padding size of conv1d in fft.
dropout: 0.1 #the dropout in network.
transformer_head: 4 #the attention head num of transformerTTS.
embedding_size: 512 #the dim size of embedding of transformerTTS.
hidden_size: 256 #the hidden size in model of transformerTTS.
warm_up_step: 4000 #the warm up step of learning rate.
grad_clip_thresh: 0.1 #the threshold of grad clip.

View File

@ -13,7 +13,7 @@ audio:
encoder_n_layer: 6 encoder_n_layer: 6
encoder_head: 2 encoder_head: 2
encoder_conv1d_filter_size: 1536 encoder_conv1d_filter_size: 1536
max_sep_len: 2048 max_seq_len: 2048
decoder_n_layer: 6 decoder_n_layer: 6
decoder_head: 2 decoder_head: 2
decoder_conv1d_filter_size: 1536 decoder_conv1d_filter_size: 1536
@ -23,11 +23,4 @@ duration_predictor_filter_size: 3
fft_conv1d_filter: 3 fft_conv1d_filter: 3
fft_conv1d_padding: 1 fft_conv1d_padding: 1
dropout: 0.1 dropout: 0.1
transformer_head: 4 transformer_head: 4
use_gpu: True
alpha: 1.0
checkpoint_path: checkpoint/
fastspeech_step: 71000
log_dir: ./log

View File

@ -0,0 +1,36 @@
import argparse
def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--fastspeech_step', type=int, default=70000,
help="Global step to restore checkpoint of fastspeech.")
parser.add_argument('--use_gpu', type=int, default=1,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log',
help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=160000,
help="the step to load transformerTTS model.")

View File

@ -1,15 +1,16 @@
import os import os
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from collections import OrderedDict from collections import OrderedDict
import jsonargparse import argparse
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from ruamel import yaml
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
from parakeet.g2p.en import text_to_sequence from parakeet.g2p.en import text_to_sequence
from parakeet import audio from parakeet import audio
from network import FastSpeech from parakeet.models.fastspeech.fastspeech import FastSpeech
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@ -21,19 +22,22 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict return new_state_dict
def synthesis(text_input, cfg): def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
# tensorboard # tensorboard
if not os.path.exists(cfg.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(cfg.log_dir) os.mkdir(args.log_dir)
path = os.path.join(cfg.log_dir,'synthesis') path = os.path.join(args.log_dir,'synthesis')
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
writer = SummaryWriter(path) writer = SummaryWriter(path)
with dg.guard(place): with dg.guard(place):
model = FastSpeech(cfg) model = FastSpeech(cfg)
model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))) model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
model.eval() model.eval()
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
@ -41,18 +45,18 @@ def synthesis(text_input, cfg):
pos_text = np.arange(1, text.shape[1]+1) pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha) mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg.audio.sr, sample_rate=cfg['audio']['sr'],
num_mels=cfg.audio.num_mels, num_mels=cfg['audio']['num_mels'],
min_level_db=cfg.audio.min_level_db, min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg.audio.ref_level_db, ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg.audio.n_fft, n_fft=cfg['audio']['n_fft'],
win_length= cfg.audio.win_length, win_length= cfg['audio']['win_length'],
hop_length= cfg.audio.hop_length, hop_length= cfg['audio']['hop_length'],
power=cfg.audio.power, power=cfg['audio']['power'],
preemphasis=cfg.audio.preemphasis, preemphasis=cfg['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
symmetric_norm=False, symmetric_norm=False,
max_norm=1., max_norm=1.,
@ -65,12 +69,12 @@ def synthesis(text_input, cfg):
mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0]) mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy()) wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
writer.add_audio(text_input, wav, 0, cfg.audio.sr) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
print("Synthesis completed !!!") print("Synthesis completed !!!")
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) args = parser.parse_args()
synthesis("Transformer model is so fast!", cfg) synthesis("Transformer model is so fast!", args)

View File

@ -3,10 +3,10 @@ import argparse
import os import os
import time import time
import math import math
import jsonargparse
from pathlib import Path from pathlib import Path
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from ruamel import yaml
from tqdm import tqdm from tqdm import tqdm
from collections import OrderedDict from collections import OrderedDict
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
@ -14,7 +14,7 @@ import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.models.dataloader.ljspeech import LJSpeechLoader from parakeet.models.dataloader.ljspeech import LJSpeechLoader
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
from parakeet.models.fastspeech.fastspeech import FastSpeech from parakeet.models.fastspeech.fastspeech import FastSpeech
from parakeet.models.fastspeech.utils import get_alignment from parakeet.models.fastspeech.utils import get_alignment
@ -28,50 +28,49 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict return new_state_dict, opti_dict
def main(cfg): def main(args):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
if local_rank == 0: with open(args.config_path) as f:
# Print the whole config setting. cfg = yaml.load(f, Loader=yaml.Loader)
pprint(jsonargparse.namespace_to_dict(cfg))
global_step = 0 global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0) if args.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace()) if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(cfg.log_dir) os.mkdir(args.log_dir)
path = os.path.join(cfg.log_dir,'fastspeech') path = os.path.join(args.log_dir,'fastspeech')
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
transformerTTS = TransformerTTS(cfg) transformerTTS = TransformerTTS(cfg)
model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer")) model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
transformerTTS.set_dict(model_dict) transformerTTS.set_dict(model_dict)
transformerTTS.eval() transformerTTS.eval()
model = FastSpeech(cfg) model = FastSpeech(cfg)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters()) parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader() reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
if cfg.checkpoint_path is not None: if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")) model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
model.set_dict(model_dict) model.set_dict(model_dict)
optimizer.set_dict(opti_dict) optimizer.set_dict(opti_dict)
global_step = cfg.fastspeech_step global_step = args.fastspeech_step
print("load checkpoint!!!") print("load checkpoint!!!")
if cfg.use_data_parallel: if args.use_data_parallel:
strategy = dg.parallel.prepare_context() strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy) model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch in range(cfg.epochs): for epoch in range(args.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
@ -79,7 +78,7 @@ def main(cfg):
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32) alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
global_step += 1 global_step += 1
@ -101,20 +100,20 @@ def main(cfg):
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if cfg.use_data_parallel: if args.use_data_parallel:
total_loss = model.scale_loss(total_loss) total_loss = model.scale_loss(total_loss)
total_loss.backward() total_loss.backward()
model.apply_collective_grads() model.apply_collective_grads()
else: else:
total_loss.backward() total_loss.backward()
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
model.clear_gradients() model.clear_gradients()
# save checkpoint # save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0: if local_rank==0 and global_step % args.save_step == 0:
if not os.path.exists(cfg.save_path): if not os.path.exists(args.save_path):
os.mkdir(cfg.save_path) os.mkdir(args.save_path)
save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step) save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0: if local_rank==0:
@ -122,7 +121,9 @@ def main(cfg):
if __name__ =='__main__': if __name__ =='__main__':
parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse') parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
cfg = parser.parse_args('-c config/fastspeech.yaml'.split()) args = parser.parse_args()
main(cfg) # Print the whole config setting.
pprint(args)
main(args)

View File

@ -0,0 +1,11 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1

View File

@ -10,11 +10,11 @@ audio:
ref_level_db: 20 ref_level_db: 20
outputs_per_step: 1 outputs_per_step: 1
max_len: 50
transformer_step: 10
postnet_step: 10
use_gpu: True
checkpoint_path: ./checkpoint hidden_size: 256
log_dir: ./log embedding_size: 512
sample_path: ./sample warm_up_step: 4000
grad_clip_thresh: 1.0

View File

@ -0,0 +1,16 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0

View File

@ -0,0 +1,38 @@
import argparse
def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
help="Global step to restore checkpoint of transformer.")
parser.add_argument('--vocoder_step', type=int, default=90000,
help="Global step to restore checkpoint of postnet.")
parser.add_argument('--use_gpu', type=int, default=1,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0,
help="use data parallel or not during training.")
parser.add_argument('--stop_token', type=int, default=0,
help="use stop token loss in network or not.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
help="the directory to save audio sample in synthesis.")

View File

@ -2,17 +2,19 @@ import os
from scipy.io.wavfile import write from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence from parakeet.g2p.en import text_to_sequence
import numpy as np import numpy as np
from network import TransformerTTS, ModelPostNet
from tqdm import tqdm from tqdm import tqdm
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from ruamel import yaml
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
from pathlib import Path from pathlib import Path
import jsonargparse import argparse
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from collections import OrderedDict from collections import OrderedDict
from parakeet import audio from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@ -24,25 +26,28 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict return new_state_dict
def synthesis(text_input, cfg): def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
# tensorboard # tensorboard
if not os.path.exists(cfg.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(cfg.log_dir) os.mkdir(args.log_dir)
path = os.path.join(cfg.log_dir,'synthesis') path = os.path.join(args.log_dir,'synthesis')
writer = SummaryWriter(path) writer = SummaryWriter(path)
with dg.guard(place): with dg.guard(place):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
model = TransformerTTS(cfg) model = TransformerTTS(cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer"))) model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "nostop_token/transformer")))
model.eval() model.eval()
with fluid.unique_name.guard(): with fluid.unique_name.guard():
model_postnet = ModelPostNet(cfg) model_postnet = Vocoder(cfg, args.batch_size)
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) model_postnet.set_dict(load_checkpoint(str(args.postnet_step), os.path.join(args.checkpoint_path, "postnet")))
model_postnet.eval() model_postnet.eval()
# init input # init input
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
@ -52,7 +57,7 @@ def synthesis(text_input, cfg):
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
pbar = tqdm(range(cfg.max_len)) pbar = tqdm(range(args.max_len))
for i in pbar: for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1) pos_mel = np.arange(1, mel_input.shape[1]+1)
@ -62,15 +67,15 @@ def synthesis(text_input, cfg):
mag_pred = model_postnet(postnet_pred) mag_pred = model_postnet(postnet_pred)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg.audio.sr, sample_rate=cfg['audio']['sr'],
num_mels=cfg.audio.num_mels, num_mels=cfg['audio']['num_mels'],
min_level_db=cfg.audio.min_level_db, min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg.audio.ref_level_db, ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg.audio.n_fft, n_fft=cfg['audio']['n_fft'],
win_length= cfg.audio.win_length, win_length= cfg['audio']['win_length'],
hop_length= cfg.audio.hop_length, hop_length= cfg['audio']['hop_length'],
power=cfg.audio.power, power=cfg['audio']['power'],
preemphasis=cfg.audio.preemphasis, preemphasis=cfg['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
symmetric_norm=False, symmetric_norm=False,
max_norm=1., max_norm=1.,
@ -82,14 +87,14 @@ def synthesis(text_input, cfg):
sound_norm=False) sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
writer.add_audio(text_input, wav, 0, cfg.audio.sr) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(cfg.sample_path): if not os.path.exists(args.sample_path):
os.mkdir(cfg.sample_path) os.mkdir(args.sample_path)
write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav) write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) args = parser.parse_args()
synthesis("Transformer model is so fast!", cfg) synthesis("Transformer model is so fast!", args)

View File

@ -3,9 +3,10 @@ from tqdm import tqdm
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from pathlib import Path from pathlib import Path
from collections import OrderedDict from collections import OrderedDict
import jsonargparse import argparse
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from ruamel import yaml
from matplotlib import cm from matplotlib import cm
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
@ -13,7 +14,7 @@ import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from parakeet.modules.utils import cross_entropy from parakeet.modules.utils import cross_entropy
from parakeet.models.dataloader.ljspeech import LJSpeechLoader from parakeet.models.dataloader.ljspeech import LJSpeechLoader
from parakeet.models.transformerTTS.transformerTTS import TransformerTTS from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@ -26,22 +27,21 @@ def load_checkpoint(step, model_path):
return new_state_dict, opti_dict return new_state_dict, opti_dict
def main(cfg): def main(args):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
if local_rank == 0: with open(args.config_path) as f:
# Print the whole config setting. cfg = yaml.load(f, Loader=yaml.Loader)
pprint(jsonargparse.namespace_to_dict(cfg))
global_step = 0 global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0) if args.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace()) if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(cfg.log_dir) os.mkdir(args.log_dir)
path = os.path.join(cfg.log_dir,'transformer') path = os.path.join(args.log_dir,'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
@ -49,23 +49,23 @@ def main(cfg):
model = TransformerTTS(cfg) model = TransformerTTS(cfg)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters()) parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader() reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
if cfg.checkpoint_path is not None: if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")) model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
model.set_dict(model_dict) model.set_dict(model_dict)
optimizer.set_dict(opti_dict) optimizer.set_dict(opti_dict)
global_step = cfg.transformer_step global_step = args.transformer_step
print("load checkpoint!!!") print("load checkpoint!!!")
if cfg.use_data_parallel: if args.use_data_parallel:
strategy = dg.parallel.prepare_context() strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy) model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch in range(cfg.epochs): for epoch in range(args.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch) pbar.set_description('Processing at epoch %d'%epoch)
@ -81,7 +81,7 @@ def main(cfg):
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss loss = mel_loss + post_mel_loss
# Note: When used stop token loss the learning did not work. # Note: When used stop token loss the learning did not work.
if cfg.stop_token: if args.stop_token:
stop_loss = cross_entropy(stop_preds, label) stop_loss = cross_entropy(stop_preds, label)
loss = loss + stop_loss loss = loss + stop_loss
@ -91,7 +91,7 @@ def main(cfg):
'post_mel_loss':post_mel_loss.numpy() 'post_mel_loss':post_mel_loss.numpy()
}, global_step) }, global_step)
if cfg.stop_token: if args.stop_token:
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
writer.add_scalars('alphas', { writer.add_scalars('alphas', {
@ -101,7 +101,7 @@ def main(cfg):
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if global_step % cfg.image_step == 1: if global_step % args.image_step == 1:
for i, prob in enumerate(attn_probs): for i, prob in enumerate(attn_probs):
for j in range(4): for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
@ -117,20 +117,20 @@ def main(cfg):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
if cfg.use_data_parallel: if args.use_data_parallel:
loss = model.scale_loss(loss) loss = model.scale_loss(loss)
loss.backward() loss.backward()
model.apply_collective_grads() model.apply_collective_grads()
else: else:
loss.backward() loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
model.clear_gradients() model.clear_gradients()
# save checkpoint # save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0: if local_rank==0 and global_step % args.save_step == 0:
if not os.path.exists(cfg.save_path): if not os.path.exists(args.save_path):
os.mkdir(cfg.save_path) os.mkdir(args.save_path)
save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step) save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0: if local_rank==0:
@ -138,7 +138,10 @@ def main(cfg):
if __name__ =='__main__': if __name__ =='__main__':
parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse') parser = argparse.ArgumentParser(description="Train TransformerTTS model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
main(cfg) args = parser.parse_args()
# Print the whole config setting.
pprint(args)
main(args)

View File

@ -3,14 +3,15 @@ import os
from tqdm import tqdm from tqdm import tqdm
from pathlib import Path from pathlib import Path
from collections import OrderedDict from collections import OrderedDict
import jsonargparse import argparse
from ruamel import yaml
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from parakeet.models.dataloader.ljspeech import LJSpeechLoader from parakeet.models.dataloader.ljspeech import LJSpeechLoader
from parakeet.models.transformerTTS.vocoder import Vocoder from parakeet.models.transformer_tts.vocoder import Vocoder
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step)) model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
@ -22,48 +23,47 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict return new_state_dict, opti_dict
def main(cfg): def main(args):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
if local_rank == 0: with open(args.config_path) as f:
# Print the whole config setting. cfg = yaml.load(f, Loader=yaml.Loader)
pprint(jsonargparse.namespace_to_dict(cfg))
global_step = 0 global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0) if args.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace()) if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(cfg.log_dir) os.mkdir(args.log_dir)
path = os.path.join(cfg.log_dir,'postnet') path = os.path.join(args.log_dir,'postnet')
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
model = Vocoder(cfg) model = Vocoder(cfg, args.batch_size)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters()) parameter_list=model.parameters())
if cfg.checkpoint_path is not None: if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")) model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "postnet"))
model.set_dict(model_dict) model.set_dict(model_dict)
optimizer.set_dict(opti_dict) optimizer.set_dict(opti_dict)
global_step = cfg.postnet_step global_step = args.vocoder_step
print("load checkpoint!!!") print("load checkpoint!!!")
if cfg.use_data_parallel: if args.use_data_parallel:
strategy = dg.parallel.prepare_context() strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy) model = fluid.dygraph.parallel.DataParallel(model, strategy)
reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader() reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(cfg.epochs): for epoch in range(args.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch) pbar.set_description('Processing at epoch %d'%epoch)
@ -75,13 +75,13 @@ def main(cfg):
mag_pred = model(mel) mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
if cfg.use_data_parallel: if args.use_data_parallel:
loss = model.scale_loss(loss) loss = model.scale_loss(loss)
loss.backward() loss.backward()
model.apply_collective_grads() model.apply_collective_grads()
else: else:
loss.backward() loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
model.clear_gradients() model.clear_gradients()
if local_rank==0: if local_rank==0:
@ -89,10 +89,10 @@ def main(cfg):
'loss':loss.numpy(), 'loss':loss.numpy(),
}, global_step) }, global_step)
if global_step % cfg.save_step == 0: if global_step % args.save_step == 0:
if not os.path.exists(cfg.save_path): if not os.path.exists(args.save_path):
os.mkdir(cfg.save_path) os.mkdir(args.save_path)
save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step) save_path = os.path.join(args.save_path,'postnet/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path)
@ -100,7 +100,9 @@ def main(cfg):
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') parser = argparse.ArgumentParser(description="Train postnet model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split()) args = parser.parse_args()
main(cfg) # Print the whole config setting.
pprint(args)
main(args)

View File

@ -13,17 +13,17 @@ from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader: class LJSpeechLoader:
def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True): def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(config.data_path) LJSPEECH_ROOT = Path(args.data_path)
metadata = LJSpeechMetaData(LJSPEECH_ROOT) metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config) transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer) dataset = TransformDataset(metadata, transformer)
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle) sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
assert config.batch_size % nranks == 0 assert args.batch_size % nranks == 0
each_bs = config.batch_size // nranks each_bs = args.batch_size // nranks
if is_vocoder: if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True) dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
else: else:
@ -63,15 +63,15 @@ class LJSpeech(object):
super(LJSpeech, self).__init__() super(LJSpeech, self).__init__()
self.config = config self.config = config
self._ljspeech_processor = audio.AudioProcessor( self._ljspeech_processor = audio.AudioProcessor(
sample_rate=config.audio.sr, sample_rate=config['audio']['sr'],
num_mels=config.audio.num_mels, num_mels=config['audio']['num_mels'],
min_level_db=config.audio.min_level_db, min_level_db=config['audio']['min_level_db'],
ref_level_db=config.audio.ref_level_db, ref_level_db=config['audio']['ref_level_db'],
n_fft=config.audio.n_fft, n_fft=config['audio']['n_fft'],
win_length= config.audio.win_length, win_length= config['audio']['win_length'],
hop_length= config.audio.hop_length, hop_length= config['audio']['hop_length'],
power=config.audio.power, power=config['audio']['power'],
preemphasis=config.audio.preemphasis, preemphasis=config['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
symmetric_norm=False, symmetric_norm=False,
max_norm=1., max_norm=1.,
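LJSpeechLoader now receives both the YAML dict (config) and the argparse namespace (args): audio parameters keep coming from the YAML, while data_path, batch_size and use_gpu come from the command line. A hedged sketch of the single-process call site, mirroring the FastSpeech training loop above (the Namespace fields are stand-ins for a real parse_args result):

import argparse
from ruamel import yaml
from parakeet.models.dataloader.ljspeech import LJSpeechLoader

args = argparse.Namespace(data_path='./dataset/LJSpeech-1.1',
                          batch_size=32, use_gpu=0,   # CPU here just for the sketch
                          config_path='config/fastspeech.yaml')
with open(args.config_path) as f:
    cfg = yaml.load(f, Loader=yaml.Loader)

reader = LJSpeechLoader(cfg, args, 1, 0, shuffle=True).reader()   # nranks=1, rank=0
for data in reader:
    character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
    break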

View File

@ -2,7 +2,7 @@ import math
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols from parakeet.g2p.text.symbols import symbols
from parakeet.models.transformerTTS.post_convnet import PostConvNet from parakeet.models.transformer_tts.post_convnet import PostConvNet
from parakeet.models.fastspeech.LengthRegulator import LengthRegulator from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder from parakeet.models.fastspeech.decoder import Decoder
@ -13,43 +13,43 @@ class FastSpeech(dg.Layer):
super(FastSpeech, self).__init__() super(FastSpeech, self).__init__()
self.encoder = Encoder(n_src_vocab=len(symbols)+1, self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg.max_sep_len, len_max_seq=cfg['max_seq_len'],
n_layers=cfg.encoder_n_layer, n_layers=cfg['encoder_n_layer'],
n_head=cfg.encoder_head, n_head=cfg['encoder_head'],
d_k=cfg.fs_hidden_size // cfg.encoder_head, d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_v=cfg.fs_hidden_size // cfg.encoder_head, d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_model=cfg.fs_hidden_size, d_model=cfg['fs_hidden_size'],
d_inner=cfg.encoder_conv1d_filter_size, d_inner=cfg['encoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg.fft_conv1d_padding, fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1) dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size, self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
out_channels=cfg.duration_predictor_output_size, out_channels=cfg['duration_predictor_output_size'],
filter_size=cfg.duration_predictor_filter_size, filter_size=cfg['duration_predictor_filter_size'],
dropout=cfg.dropout) dropout=cfg['dropout'])
self.decoder = Decoder(len_max_seq=cfg.max_sep_len, self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
n_layers=cfg.decoder_n_layer, n_layers=cfg['decoder_n_layer'],
n_head=cfg.decoder_head, n_head=cfg['decoder_head'],
d_k=cfg.fs_hidden_size // cfg.decoder_head, d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_v=cfg.fs_hidden_size // cfg.decoder_head, d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_model=cfg.fs_hidden_size, d_model=cfg['fs_hidden_size'],
d_inner=cfg.decoder_conv1d_filter_size, d_inner=cfg['decoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg.fft_conv1d_padding, fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1) dropout=0.1)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
k = math.sqrt(1 / cfg.fs_hidden_size) k = math.sqrt(1 / cfg['fs_hidden_size'])
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.mel_linear = dg.Linear(cfg.fs_hidden_size, self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
cfg.audio.num_mels * cfg.audio.outputs_per_step, cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'],
param_attr = self.weight, param_attr = self.weight,
bias_attr = self.bias,) bias_attr = self.bias,)
self.postnet = PostConvNet(n_mels=cfg.audio.num_mels, self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
num_hidden=512, num_hidden=512,
filter_size=5, filter_size=5,
padding=int(5 / 2), padding=int(5 / 2),
num_conv=5, num_conv=5,
outputs_per_step=cfg.audio.outputs_per_step, outputs_per_step=cfg['audio']['outputs_per_step'],
use_cudnn=True, use_cudnn=True,
dropout=0.1, dropout=0.1,
batchnorm_last=True) batchnorm_last=True)

View File

@ -4,8 +4,8 @@ import paddle.fluid as fluid
from parakeet.modules.utils import * from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformerTTS.prenet import PreNet from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformerTTS.post_convnet import PostConvNet from parakeet.models.transformer_tts.post_convnet import PostConvNet
class Decoder(dg.Layer): class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4): def __init__(self, num_hidden, config, num_head=4):
@ -20,7 +20,7 @@ class Decoder(dg.Layer):
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False)) trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels, self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'],
hidden_size = num_hidden * 2, hidden_size = num_hidden * 2,
output_size = num_hidden, output_size = num_hidden,
dropout_rate=0.2) dropout_rate=0.2)
@ -38,17 +38,17 @@ class Decoder(dg.Layer):
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step, self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.stop_linear = dg.Linear(num_hidden, 1, self.stop_linear = dg.Linear(num_hidden, 1,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
filter_size = 5, padding = 4, num_conv=5, filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config.audio.outputs_per_step, outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn = config.use_gpu) use_cudnn = True)
def forward(self, key, value, query, c_mask, positional): def forward(self, key, value, query, c_mask, positional):

View File

@ -3,10 +3,10 @@ import paddle.fluid as fluid
from parakeet.modules.utils import * from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
class Encoder(dg.Layer): class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config, num_head=4): def __init__(self, embedding_size, num_hidden, num_head=4):
super(Encoder, self).__init__() super(Encoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
@ -19,11 +19,11 @@ class Encoder(dg.Layer):
trainable=False)) trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden, num_hidden = num_hidden,
use_cudnn=config.use_gpu) use_cudnn=True)
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.layers): for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer) self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)

View File

@ -1,13 +1,13 @@
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.models.transformerTTS.encoder import Encoder from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformerTTS.decoder import Decoder from parakeet.models.transformer_tts.decoder import Decoder
class TransformerTTS(dg.Layer): class TransformerTTS(dg.Layer):
def __init__(self, config): def __init__(self, config):
super(TransformerTTS, self).__init__() super(TransformerTTS, self).__init__()
self.encoder = Encoder(config.embedding_size, config.hidden_size, config) self.encoder = Encoder(config['embedding_size'], config['hidden_size'])
self.decoder = Decoder(config.hidden_size, config) self.decoder = Decoder(config['hidden_size'], config)
self.config = config self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel): def forward(self, characters, mel_input, pos_text, pos_mel):

View File

@ -2,20 +2,20 @@ import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D from parakeet.modules.customized import Conv1D
from parakeet.modules.utils import * from parakeet.modules.utils import *
from parakeet.models.transformerTTS.CBHG import CBHG from parakeet.models.transformer_tts.CBHG import CBHG
class Vocoder(dg.Layer): class Vocoder(dg.Layer):
""" """
CBHG Network (mel -> linear) CBHG Network (mel -> linear)
""" """
def __init__(self, config): def __init__(self, config, batch_size):
super(Vocoder, self).__init__() super(Vocoder, self).__init__()
self.pre_proj = Conv1D(num_channels = config.audio.num_mels, self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'],
num_filters = config.hidden_size, num_filters = config['hidden_size'],
filter_size=1) filter_size=1)
self.cbhg = CBHG(config.hidden_size, config.batch_size) self.cbhg = CBHG(config['hidden_size'], batch_size)
self.post_proj = Conv1D(num_channels = config.hidden_size, self.post_proj = Conv1D(num_channels = config['hidden_size'],
num_filters = (config.audio.n_fft // 2) + 1, num_filters = (config['audio']['n_fft'] // 2) + 1,
filter_size=1) filter_size=1)
def forward(self, mel): def forward(self, mel):