transform parse to argparse

parent f5ac04b1a3
commit 04d7f8b598

@@ -1,46 +0,0 @@
-audio:
-  num_mels: 80 #the number of mel bands when calculating mel spectrograms.
-  n_fft: 2048 #the number of fft components.
-  sr: 22050 #the sampling rate of audio data file.
-  preemphasis: 0.97 #the preemphasis coefficient.
-  hop_length: 256 #the number of samples to advance between frames.
-  win_length: 1024 #the length (width) of the window function.
-  power: 1.2 #the power to raise before griffin-lim.
-  min_level_db: -100 #the minimum level db.
-  ref_level_db: 20 #the reference level db.
-  outputs_per_step: 1 #the outputs per step.
-
-encoder_n_layer: 6
-encoder_head: 2
-encoder_conv1d_filter_size: 1536
-max_sep_len: 2048
-decoder_n_layer: 6
-decoder_head: 2
-decoder_conv1d_filter_size: 1536
-fs_hidden_size: 384
-duration_predictor_output_size: 256
-duration_predictor_filter_size: 3
-fft_conv1d_filter: 3
-fft_conv1d_padding: 1
-dropout: 0.1
-transformer_head: 4
-
-embedding_size: 512
-hidden_size: 256
-
-warm_up_step: 4000
-grad_clip_thresh: 0.1
-batch_size: 32
-epochs: 10000
-lr: 0.001
-save_step: 500
-use_gpu: True
-use_data_parallel: True
-
-data_path: ../../dataset/LJSpeech-1.1
-transtts_path: ../TransformerTTS/checkpoint/
-transformer_step: 160000
-save_path: ./checkpoint
-log_dir: ./log
-#checkpoint_path: ./checkpoint
-#transformer_step: 97000
@@ -1,97 +0,0 @@
-import jsonargparse
-
-def add_config_options_to_parser(parser):
-    parser.add_argument('--audio.num_mels', type=int, default=80,
-                        help="the number of mel bands when calculating mel spectrograms.")
-    parser.add_argument('--audio.n_fft', type=int, default=2048,
-                        help="the number of fft components.")
-    parser.add_argument('--audio.sr', type=int, default=22050,
-                        help="the sampling rate of audio data file.")
-    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
-                        help="the preemphasis coefficient.")
-    parser.add_argument('--audio.hop_length', type=int, default=128,
-                        help="the number of samples to advance between frames.")
-    parser.add_argument('--audio.win_length', type=int, default=1024,
-                        help="the length (width) of the window function.")
-    parser.add_argument('--audio.power', type=float, default=1.4,
-                        help="the power to raise before griffin-lim.")
-    parser.add_argument('--audio.min_level_db', type=int, default=-100,
-                        help="the minimum level db.")
-    parser.add_argument('--audio.ref_level_db', type=int, default=20,
-                        help="the reference level db.")
-    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
-                        help="the outputs per step.")
-
-    parser.add_argument('--encoder_n_layer', type=int, default=6,
-                        help="the number of FFT Block in encoder.")
-    parser.add_argument('--encoder_head', type=int, default=2,
-                        help="the attention head number in encoder.")
-    parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
-                        help="the filter size of conv1d in encoder.")
-    parser.add_argument('--max_sep_len', type=int, default=2048,
-                        help="the max length of sequence.")
-    parser.add_argument('--decoder_n_layer', type=int, default=6,
-                        help="the number of FFT Block in decoder.")
-    parser.add_argument('--decoder_head', type=int, default=2,
-                        help="the attention head number in decoder.")
-    parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
-                        help="the filter size of conv1d in decoder.")
-    parser.add_argument('--fs_hidden_size', type=int, default=256,
-                        help="the hidden size in model of fastspeech.")
-    parser.add_argument('--duration_predictor_output_size', type=int, default=256,
-                        help="the output size of duration predictor.")
-    parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
-                        help="the filter size of conv1d in duration prediction.")
-    parser.add_argument('--fft_conv1d_filter', type=int, default=3,
-                        help="the filter size of conv1d in fft.")
-    parser.add_argument('--fft_conv1d_padding', type=int, default=1,
-                        help="the padding size of conv1d in fft.")
-    parser.add_argument('--dropout', type=float, default=0.1,
-                        help="the dropout in network.")
-    parser.add_argument('--transformer_head', type=int, default=4,
-                        help="the attention head num of transformerTTS.")
-    parser.add_argument('--alpha', type=float, default=1.0,
-                        help="the hyperparameter to determine the length of the expanded sequence\
-                        mel, thereby controlling the voice speed.")
-
-    parser.add_argument('--hidden_size', type=int, default=256,
-                        help="the hidden size in model of transformerTTS.")
-    parser.add_argument('--embedding_size', type=int, default=256,
-                        help="the dim size of embedding of transformerTTS.")
-
-    parser.add_argument('--warm_up_step', type=int, default=4000,
-                        help="the warm up step of learning rate.")
-    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
-                        help="the threshold of grad clip.")
-    parser.add_argument('--batch_size', type=int, default=32,
-                        help="batch size for training.")
-    parser.add_argument('--epochs', type=int, default=10000,
-                        help="the number of epoch for training.")
-    parser.add_argument('--lr', type=float, default=0.001,
-                        help="the learning rate for training.")
-    parser.add_argument('--save_step', type=int, default=500,
-                        help="checkpointing interval during training.")
-    parser.add_argument('--fastspeech_step', type=int, default=160000,
-                        help="Global step to restore checkpoint of fastspeech.")
-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="use gpu or not during training.")
-    parser.add_argument('--use_data_parallel', type=bool, default=False,
-                        help="use data parallel or not during training.")
-
-    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
-                        help="the path of dataset.")
-    parser.add_argument('--checkpoint_path', type=str, default=None,
-                        help="the path to load checkpoint or pretrain model.")
-    parser.add_argument('--save_path', type=str, default='./checkpoint',
-                        help="the path to save checkpoint.")
-    parser.add_argument('--log_dir', type=str, default='./log',
-                        help="the directory to save tensorboard log.")
-    parser.add_argument('--sample_path', type=str, default='./sample',
-                        help="the directory to save audio sample in synthesis.")
-    parser.add_argument('--transtts_path', type=str, default='./log',
-                        help="the directory to load pretrain transformerTTS model.")
-    parser.add_argument('--transformer_step', type=int, default=70000,
-                        help="the step to load transformerTTS model.")
-
-
-    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
@@ -1,35 +0,0 @@
-audio:
-  num_mels: 80
-  n_fft: 2048
-  sr: 22050
-  preemphasis: 0.97
-  hop_length: 275
-  win_length: 1102
-  power: 1.2
-  min_level_db: -100
-  ref_level_db: 20
-  outputs_per_step: 1
-
-
-hidden_size: 256
-embedding_size: 512
-
-
-warm_up_step: 4000
-grad_clip_thresh: 1.0
-batch_size: 32
-epochs: 10000
-lr: 0.001
-save_step: 1000
-image_step: 2000
-use_gpu: True
-use_data_parallel: False
-stop_token: False
-
-data_path: ../../dataset/LJSpeech-1.1
-save_path: ./checkpoint
-log_dir: ./log
-#checkpoint_path: ./checkpoint
-#transformer_step: 97000
-
-
@@ -1,29 +0,0 @@
-audio:
-  num_mels: 80
-  n_fft: 2048
-  sr: 22050
-  preemphasis: 0.97
-  hop_length: 275
-  win_length: 1102
-  power: 1.2
-  min_level_db: -100
-  ref_level_db: 20
-  outputs_per_step: 1
-
-hidden_size: 256
-embedding_size: 512
-
-warm_up_step: 4000
-grad_clip_thresh: 1.0
-batch_size: 32
-epochs: 10000
-lr: 0.001
-save_step: 10
-use_gpu: True
-use_data_parallel: True
-
-data_path: ../../dataset/LJSpeech-1.1
-save_path: ./checkpoint
-log_dir: ./log
-#checkpoint_path: ./checkpoint
-#transformer_step: 27000
@@ -1,69 +0,0 @@
-import jsonargparse
-
-def add_config_options_to_parser(parser):
-    parser.add_argument('--audio.num_mels', type=int, default=80,
-                        help="the number of mel bands when calculating mel spectrograms.")
-    parser.add_argument('--audio.n_fft', type=int, default=2048,
-                        help="the number of fft components.")
-    parser.add_argument('--audio.sr', type=int, default=22050,
-                        help="the sampling rate of audio data file.")
-    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
-                        help="the preemphasis coefficient.")
-    parser.add_argument('--audio.hop_length', type=int, default=128,
-                        help="the number of samples to advance between frames.")
-    parser.add_argument('--audio.win_length', type=int, default=1024,
-                        help="the length (width) of the window function.")
-    parser.add_argument('--audio.power', type=float, default=1.4,
-                        help="the power to raise before griffin-lim.")
-    parser.add_argument('--audio.min_level_db', type=int, default=-100,
-                        help="the minimum level db.")
-    parser.add_argument('--audio.ref_level_db', type=int, default=20,
-                        help="the reference level db.")
-    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
-                        help="the outputs per step.")
-
-    parser.add_argument('--hidden_size', type=int, default=256,
-                        help="the hidden size in network.")
-    parser.add_argument('--embedding_size', type=int, default=512,
-                        help="the embedding vector size.")
-
-    parser.add_argument('--warm_up_step', type=int, default=4000,
-                        help="the warm up step of learning rate.")
-    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
-                        help="the threshold of grad clip.")
-    parser.add_argument('--batch_size', type=int, default=32,
-                        help="batch size for training.")
-    parser.add_argument('--epochs', type=int, default=10000,
-                        help="the number of epoch for training.")
-    parser.add_argument('--lr', type=float, default=0.001,
-                        help="the learning rate for training.")
-    parser.add_argument('--save_step', type=int, default=500,
-                        help="checkpointing interval during training.")
-    parser.add_argument('--image_step', type=int, default=2000,
-                        help="attention image interval during training.")
-    parser.add_argument('--max_len', type=int, default=400,
-                        help="The max length of audio when synthesis.")
-    parser.add_argument('--transformer_step', type=int, default=160000,
-                        help="Global step to restore checkpoint of transformer.")
-    parser.add_argument('--postnet_step', type=int, default=90000,
-                        help="Global step to restore checkpoint of postnet.")
-    parser.add_argument('--use_gpu', type=bool, default=True,
-                        help="use gpu or not during training.")
-    parser.add_argument('--use_data_parallel', type=bool, default=False,
-                        help="use data parallel or not during training.")
-    parser.add_argument('--stop_token', type=bool, default=False,
-                        help="use stop token loss in network or not.")
-
-    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
-                        help="the path of dataset.")
-    parser.add_argument('--checkpoint_path', type=str, default=None,
-                        help="the path to load checkpoint or pretrain model.")
-    parser.add_argument('--save_path', type=str, default='./checkpoint',
-                        help="the path to save checkpoint.")
-    parser.add_argument('--log_dir', type=str, default='./log',
-                        help="the directory to save tensorboard log.")
-    parser.add_argument('--sample_path', type=str, default='./log',
-                        help="the directory to save audio sample in synthesis.")
-
-
-    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
@@ -0,0 +1,32 @@
+audio:
+  num_mels: 80 #the number of mel bands when calculating mel spectrograms.
+  n_fft: 2048 #the number of fft components.
+  sr: 22050 #the sampling rate of audio data file.
+  preemphasis: 0.97 #the preemphasis coefficient.
+  hop_length: 256 #the number of samples to advance between frames.
+  win_length: 1024 #the length (width) of the window function.
+  power: 1.2 #the power to raise before griffin-lim.
+  min_level_db: -100 #the minimum level db.
+  ref_level_db: 20 #the reference level db.
+  outputs_per_step: 1 #the outputs per step.
+
+encoder_n_layer: 6 #the number of FFT Block in encoder.
+encoder_head: 2 #the attention head number in encoder.
+encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder.
+max_seq_len: 2048 #the max length of sequence.
+decoder_n_layer: 6 #the number of FFT Block in decoder.
+decoder_head: 2 #the attention head number in decoder.
+decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder.
+fs_hidden_size: 384 #the hidden size in model of fastspeech.
+duration_predictor_output_size: 256 #the output size of duration predictor.
+duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction.
+fft_conv1d_filter: 3 #the filter size of conv1d in fft.
+fft_conv1d_padding: 1 #the padding size of conv1d in fft.
+dropout: 0.1 #the dropout in network.
+transformer_head: 4 #the attention head num of transformerTTS.
+
+embedding_size: 512 #the dim size of embedding of transformerTTS.
+hidden_size: 256 #the hidden size in model of transformerTTS.
+warm_up_step: 4000 #the warm up step of learning rate.
+grad_clip_thresh: 0.1 #the threshold of grad clip.
+
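Note: the yaml added above now carries only model hyperparameters. The reworked scripts in the hunks below read it with ruamel and index it as a plain dict rather than as attributes. A minimal sketch of that access pattern (the path is the --config_path default from the new parse.py; the printed values match the yaml above):

    from ruamel import yaml

    with open('config/fastspeech.yaml') as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    print(cfg['audio']['num_mels'])  # 80
    print(cfg['fs_hidden_size'])     # 384
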
@@ -13,7 +13,7 @@ audio:
 encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
-max_sep_len: 2048
+max_seq_len: 2048
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
@@ -23,11 +23,4 @@ duration_predictor_filter_size: 3
 fft_conv1d_filter: 3
 fft_conv1d_padding: 1
 dropout: 0.1
-transformer_head: 4
-
-use_gpu: True
-alpha: 1.0
-
-checkpoint_path: checkpoint/
-fastspeech_step: 71000
-log_dir: ./log
+transformer_head: 4
@@ -0,0 +1,36 @@
+import argparse
+
+def add_config_options_to_parser(parser):
+    parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
+                        help="the yaml config file path.")
+    parser.add_argument('--batch_size', type=int, default=32,
+                        help="batch size for training.")
+    parser.add_argument('--epochs', type=int, default=10000,
+                        help="the number of epoch for training.")
+    parser.add_argument('--lr', type=float, default=0.001,
+                        help="the learning rate for training.")
+    parser.add_argument('--save_step', type=int, default=500,
+                        help="checkpointing interval during training.")
+    parser.add_argument('--fastspeech_step', type=int, default=70000,
+                        help="Global step to restore checkpoint of fastspeech.")
+    parser.add_argument('--use_gpu', type=int, default=1,
+                        help="use gpu or not during training.")
+    parser.add_argument('--use_data_parallel', type=int, default=0,
+                        help="use data parallel or not during training.")
+
+    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
+                        help="the path of dataset.")
+    parser.add_argument('--checkpoint_path', type=str, default=None,
+                        help="the path to load checkpoint or pretrain model.")
+    parser.add_argument('--save_path', type=str, default='./checkpoint',
+                        help="the path to save checkpoint.")
+    parser.add_argument('--log_dir', type=str, default='./log',
+                        help="the directory to save tensorboard log.")
+    parser.add_argument('--sample_path', type=str, default='./sample',
+                        help="the directory to save audio sample in synthesis.")
+    parser.add_argument('--transtts_path', type=str, default='./log',
+                        help="the directory to load pretrain transformerTTS model.")
+    parser.add_argument('--transformer_step', type=int, default=160000,
+                        help="the step to load transformerTTS model.")
+
+
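Note: use_gpu and use_data_parallel switch from type=bool (under jsonargparse) to type=int here, most likely because plain argparse runs the raw command-line string through bool(), so any non-empty value parses as True. A minimal sketch of the pitfall the integer flags avoid:

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument('--gpu_bool', type=bool, default=True)
    p.add_argument('--gpu_int', type=int, default=1)

    ns = p.parse_args(['--gpu_bool', 'False', '--gpu_int', '0'])
    print(ns.gpu_bool)  # True  -- bool('False') is truthy
    print(ns.gpu_int)   # 0     -- falsy, as intended
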
@@ -1,15 +1,16 @@
 import os
 from tensorboardX import SummaryWriter
 from collections import OrderedDict
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 from parakeet.g2p.en import text_to_sequence
 from parakeet import audio
-from network import FastSpeech
+from parakeet.models.fastspeech.fastspeech import FastSpeech
 
 def load_checkpoint(step, model_path):
     model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -21,19 +22,22 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict
 
-def synthesis(text_input, cfg):
-    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
+def synthesis(text_input, args):
+    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
 
     # tensorboard
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'synthesis')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'synthesis')
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     writer = SummaryWriter(path)
 
     with dg.guard(place):
         model = FastSpeech(cfg)
-        model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")))
+        model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
         model.eval()
 
         text = np.asarray(text_to_sequence(text_input))
@@ -41,18 +45,18 @@ def synthesis(text_input, cfg):
         pos_text = np.arange(1, text.shape[1]+1)
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
 
-        mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha)
+        mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
 
         _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg.audio.sr,
-            num_mels=cfg.audio.num_mels,
-            min_level_db=cfg.audio.min_level_db,
-            ref_level_db=cfg.audio.ref_level_db,
-            n_fft=cfg.audio.n_fft,
-            win_length= cfg.audio.win_length,
-            hop_length= cfg.audio.hop_length,
-            power=cfg.audio.power,
-            preemphasis=cfg.audio.preemphasis,
+            sample_rate=cfg['audio']['sr'],
+            num_mels=cfg['audio']['num_mels'],
+            min_level_db=cfg['audio']['min_level_db'],
+            ref_level_db=cfg['audio']['ref_level_db'],
+            n_fft=cfg['audio']['n_fft'],
+            win_length= cfg['audio']['win_length'],
+            hop_length= cfg['audio']['hop_length'],
+            power=cfg['audio']['power'],
+            preemphasis=cfg['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -65,12 +69,12 @@ def synthesis(text_input, cfg):
 
         mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
         wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
-        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
+        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        print("Synthesis completed !!!")
        writer.close()
 
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train Fastspeech model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
-    synthesis("Transformer model is so fast!", cfg)
+    args = parser.parse_args()
+    synthesis("Transformer model is so fast!", args)
@@ -3,10 +3,10 @@ import argparse
 import os
 import time
 import math
-import jsonargparse
 from pathlib import Path
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 from tqdm import tqdm
 from collections import OrderedDict
 from tensorboardX import SummaryWriter
@@ -14,7 +14,7 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 from parakeet.models.fastspeech.fastspeech import FastSpeech
 from parakeet.models.fastspeech.utils import get_alignment
 
@@ -28,50 +28,49 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict, opti_dict
 
-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
 
-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())
 
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'fastspeech')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'fastspeech')
 
     writer = SummaryWriter(path) if local_rank == 0 else None
 
     with dg.guard(place):
         with fluid.unique_name.guard():
             transformerTTS = TransformerTTS(cfg)
-            model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer"))
+            model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
             transformerTTS.set_dict(model_dict)
             transformerTTS.eval()
 
         model = FastSpeech(cfg)
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())
-        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
 
-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.fastspeech_step
+            global_step = args.fastspeech_step
             print("load checkpoint!!!")
 
-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
            pbar = tqdm(reader)
 
            for i, data in enumerate(pbar):
@@ -79,7 +78,7 @@ def main(cfg):
                character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
 
                _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
-                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32)
+                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
 
                global_step += 1
 
@@ -101,20 +100,20 @@ def main(cfg):
                    writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
 
 
-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
-                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                model.clear_gradients()
 
                # save checkpoint
-                if local_rank==0 and global_step % cfg.save_step == 0:
-                    if not os.path.exists(cfg.save_path):
-                        os.mkdir(cfg.save_path)
-                    save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step)
+                if local_rank==0 and global_step % args.save_step == 0:
+                    if not os.path.exists(args.save_path):
+                        os.mkdir(args.save_path)
+                    save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
                if local_rank==0:
@@ -122,7 +121,9 @@ def main(cfg):
 
 
 if __name__ =='__main__':
-    parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train Fastspeech model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
-    main(cfg)
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
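Note: after this hunk every entry script follows the same two-source pattern: run-time options come from the argparse Namespace, model hyperparameters from the yaml dict, and some expressions mix the two. A condensed sketch of what train.py now does at startup (names exactly as in the hunks above):

    import argparse
    from ruamel import yaml
    from parse import add_config_options_to_parser

    parser = argparse.ArgumentParser(description="Train Fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()                   # paths, lr, steps, use_gpu, ...
    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)   # network sizes, warm_up_step, ...

    # e.g. the optimizer's NoamDecay mixes both sources:
    # dg.NoamDecay(1/(cfg['warm_up_step'] * (args.lr ** 2)), cfg['warm_up_step'])
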
@@ -0,0 +1,11 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
@@ -10,11 +10,11 @@ audio:
   ref_level_db: 20
   outputs_per_step: 1
 
-max_len: 50
-transformer_step: 10
-postnet_step: 10
-use_gpu: True
-
-checkpoint_path: ./checkpoint
-log_dir: ./log
-sample_path: ./sample
+hidden_size: 256
+embedding_size: 512
+warm_up_step: 4000
+grad_clip_thresh: 1.0
+
+
+
@@ -0,0 +1,16 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
+
+hidden_size: 256
+embedding_size: 512
+warm_up_step: 4000
+grad_clip_thresh: 1.0
@@ -0,0 +1,38 @@
+import argparse
+
+def add_config_options_to_parser(parser):
+    parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
+                        help="the yaml config file path.")
+    parser.add_argument('--batch_size', type=int, default=32,
+                        help="batch size for training.")
+    parser.add_argument('--epochs', type=int, default=10000,
+                        help="the number of epoch for training.")
+    parser.add_argument('--lr', type=float, default=0.001,
+                        help="the learning rate for training.")
+    parser.add_argument('--save_step', type=int, default=500,
+                        help="checkpointing interval during training.")
+    parser.add_argument('--image_step', type=int, default=2000,
+                        help="attention image interval during training.")
+    parser.add_argument('--max_len', type=int, default=400,
+                        help="The max length of audio when synthesis.")
+    parser.add_argument('--transformer_step', type=int, default=160000,
+                        help="Global step to restore checkpoint of transformer.")
+    parser.add_argument('--vocoder_step', type=int, default=90000,
+                        help="Global step to restore checkpoint of postnet.")
+    parser.add_argument('--use_gpu', type=int, default=1,
+                        help="use gpu or not during training.")
+    parser.add_argument('--use_data_parallel', type=int, default=0,
+                        help="use data parallel or not during training.")
+    parser.add_argument('--stop_token', type=int, default=0,
+                        help="use stop token loss in network or not.")
+
+    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
+                        help="the path of dataset.")
+    parser.add_argument('--checkpoint_path', type=str, default=None,
+                        help="the path to load checkpoint or pretrain model.")
+    parser.add_argument('--save_path', type=str, default='./checkpoint',
+                        help="the path to save checkpoint.")
+    parser.add_argument('--log_dir', type=str, default='./log',
+                        help="the directory to save tensorboard log.")
+    parser.add_argument('--sample_path', type=str, default='./sample',
+                        help="the directory to save audio sample in synthesis.")
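Note: with jsonargparse's ActionConfigFile gone, the scripts are no longer launched through a hard-coded '-c …yaml' string; every option is an ordinary flag and the config file is just another argument. An illustrative invocation (the script name is assumed from the default config file above; the flag values shown are the defaults):

    python train_transformer.py --config_path config/train_transformer.yaml \
        --use_gpu 1 --use_data_parallel 0 --batch_size 32 --lr 0.001
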
@@ -2,17 +2,19 @@ import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
 import numpy as np
-from network import TransformerTTS, ModelPostNet
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
+from ruamel import yaml
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 from pathlib import Path
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
 from collections import OrderedDict
 from parakeet import audio
+from parakeet.models.transformer_tts.vocoder import Vocoder
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 
 def load_checkpoint(step, model_path):
     model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -24,25 +26,28 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict
 
-def synthesis(text_input, cfg):
-    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
+def synthesis(text_input, args):
+    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     # tensorboard
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'synthesis')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'synthesis')
 
     writer = SummaryWriter(path)
 
     with dg.guard(place):
         with fluid.unique_name.guard():
             model = TransformerTTS(cfg)
-            model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer")))
+            model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "nostop_token/transformer")))
             model.eval()
 
         with fluid.unique_name.guard():
-            model_postnet = ModelPostNet(cfg)
-            model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+            model_postnet = Vocoder(cfg, args.batch_size)
+            model_postnet.set_dict(load_checkpoint(str(args.postnet_step), os.path.join(args.checkpoint_path, "postnet")))
             model_postnet.eval()
         # init input
         text = np.asarray(text_to_sequence(text_input))
@@ -52,7 +57,7 @@ def synthesis(text_input, cfg):
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
 
 
-        pbar = tqdm(range(cfg.max_len))
+        pbar = tqdm(range(args.max_len))
 
        for i in pbar:
            pos_mel = np.arange(1, mel_input.shape[1]+1)
@@ -62,15 +67,15 @@ def synthesis(text_input, cfg):
         mag_pred = model_postnet(postnet_pred)
 
         _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg.audio.sr,
-            num_mels=cfg.audio.num_mels,
-            min_level_db=cfg.audio.min_level_db,
-            ref_level_db=cfg.audio.ref_level_db,
-            n_fft=cfg.audio.n_fft,
-            win_length= cfg.audio.win_length,
-            hop_length= cfg.audio.hop_length,
-            power=cfg.audio.power,
-            preemphasis=cfg.audio.preemphasis,
+            sample_rate=cfg['audio']['sr'],
+            num_mels=cfg['audio']['num_mels'],
+            min_level_db=cfg['audio']['min_level_db'],
+            ref_level_db=cfg['audio']['ref_level_db'],
+            n_fft=cfg['audio']['n_fft'],
+            win_length= cfg['audio']['win_length'],
+            hop_length= cfg['audio']['hop_length'],
+            power=cfg['audio']['power'],
+            preemphasis=cfg['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -82,14 +87,14 @@ def synthesis(text_input, cfg):
             sound_norm=False)
 
         wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
-        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
-        if not os.path.exists(cfg.sample_path):
-            os.mkdir(cfg.sample_path)
-        write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav)
+        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
+        if not os.path.exists(args.sample_path):
+            os.mkdir(args.sample_path)
+        write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
        writer.close()
 
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Synthesis model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
-    synthesis("Transformer model is so fast!", cfg)
+    args = parser.parse_args()
+    synthesis("Transformer model is so fast!", args)
@@ -3,9 +3,10 @@ from tqdm import tqdm
 from tensorboardX import SummaryWriter
 from pathlib import Path
 from collections import OrderedDict
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 from matplotlib import cm
 import numpy as np
 import paddle.fluid as fluid
@@ -13,7 +14,7 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 from parakeet.modules.utils import cross_entropy
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 
 def load_checkpoint(step, model_path):
     model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -26,22 +27,21 @@ def load_checkpoint(step, model_path):
     return new_state_dict, opti_dict
 
 
-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
 
-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())
 
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'transformer')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'transformer')
 
     writer = SummaryWriter(path) if local_rank == 0 else None
 
@@ -49,23 +49,23 @@ def main(cfg):
         model = TransformerTTS(cfg)
 
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())
 
-        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
-
-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
+
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.transformer_step
+            global_step = args.transformer_step
             print("load checkpoint!!!")
 
-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d'%epoch)
@@ -81,7 +81,7 @@ def main(cfg):
                post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss
                # Note: when stop token loss was used, the learning did not work.
-                if cfg.stop_token:
+                if args.stop_token:
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss
 
@@ -91,7 +91,7 @@ def main(cfg):
                        'post_mel_loss':post_mel_loss.numpy()
                    }, global_step)
 
-                if cfg.stop_token:
+                if args.stop_token:
                        writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
 
                    writer.add_scalars('alphas', {
@@ -101,7 +101,7 @@ def main(cfg):
 
                writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
 
-                if global_step % cfg.image_step == 1:
+                if global_step % args.image_step == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(4):
                            x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
@@ -117,20 +117,20 @@ def main(cfg):
                            x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
                            writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
 
-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
-                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                model.clear_gradients()
 
                # save checkpoint
-                if local_rank==0 and global_step % cfg.save_step == 0:
-                    if not os.path.exists(cfg.save_path):
-                        os.mkdir(cfg.save_path)
-                    save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step)
+                if local_rank==0 and global_step % args.save_step == 0:
+                    if not os.path.exists(args.save_path):
+                        os.mkdir(args.save_path)
+                    save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
                if local_rank==0:
@@ -138,7 +138,10 @@ def main(cfg):
 
 
 if __name__ =='__main__':
-    parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
-    main(cfg)
+
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
@@ -3,14 +3,15 @@ import os
 from tqdm import tqdm
 from pathlib import Path
 from collections import OrderedDict
-import jsonargparse
+import argparse
+from ruamel import yaml
 from parse import add_config_options_to_parser
 from pprint import pprint
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.vocoder import Vocoder
+from parakeet.models.transformer_tts.vocoder import Vocoder
 
 def load_checkpoint(step, model_path):
     model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
@@ -22,48 +23,47 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict, opti_dict
 
-def main(cfg):
+def main(args):
 
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
-
-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())
 
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'postnet')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'postnet')
 
     writer = SummaryWriter(path) if local_rank == 0 else None
 
     with dg.guard(place):
-        model = Vocoder(cfg)
+        model = Vocoder(cfg, args.batch_size)
 
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())
 
 
-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "postnet"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.postnet_step
+            global_step = args.vocoder_step
             print("load checkpoint!!!")
 
-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
-        reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
 
-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d'%epoch)
@@ -75,13 +75,13 @@ def main(cfg):
                mag_pred = model(mel)
                loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
 
-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
-                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                model.clear_gradients()
 
                if local_rank==0:
@@ -89,10 +89,10 @@ def main(cfg):
                        'loss':loss.numpy(),
                    }, global_step)
 
-                    if global_step % cfg.save_step == 0:
-                        if not os.path.exists(cfg.save_path):
-                            os.mkdir(cfg.save_path)
-                        save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step)
+                    if global_step % args.save_step == 0:
+                        if not os.path.exists(args.save_path):
+                            os.mkdir(args.save_path)
+                        save_path = os.path.join(args.save_path,'postnet/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
 
@@ -100,7 +100,9 @@ def main(cfg):
        writer.close()
 
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train postnet model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
-    main(cfg)
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
@@ -13,17 +13,17 @@ from parakeet.data.batch import TextIDBatcher, SpecBatcher
 from parakeet.data.dataset import DatasetMixin, TransformDataset
 
 class LJSpeechLoader:
-    def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
-        place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
+    def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
+        place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
 
-        LJSPEECH_ROOT = Path(config.data_path)
+        LJSPEECH_ROOT = Path(args.data_path)
         metadata = LJSpeechMetaData(LJSPEECH_ROOT)
         transformer = LJSpeech(config)
         dataset = TransformDataset(metadata, transformer)
         sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
 
-        assert config.batch_size % nranks == 0
-        each_bs = config.batch_size // nranks
+        assert args.batch_size % nranks == 0
+        each_bs = args.batch_size // nranks
         if is_vocoder:
             dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
         else:
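Note: LJSpeechLoader now receives both configuration sources explicitly, so call sites change from LJSpeechLoader(cfg, nranks, rank, ...) to the form used in the training hunks above:

    reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
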
@@ -63,15 +63,15 @@ class LJSpeech(object):
         super(LJSpeech, self).__init__()
         self.config = config
         self._ljspeech_processor = audio.AudioProcessor(
-            sample_rate=config.audio.sr,
-            num_mels=config.audio.num_mels,
-            min_level_db=config.audio.min_level_db,
-            ref_level_db=config.audio.ref_level_db,
-            n_fft=config.audio.n_fft,
-            win_length= config.audio.win_length,
-            hop_length= config.audio.hop_length,
-            power=config.audio.power,
-            preemphasis=config.audio.preemphasis,
+            sample_rate=config['audio']['sr'],
+            num_mels=config['audio']['num_mels'],
+            min_level_db=config['audio']['min_level_db'],
+            ref_level_db=config['audio']['ref_level_db'],
+            n_fft=config['audio']['n_fft'],
+            win_length= config['audio']['win_length'],
+            hop_length= config['audio']['hop_length'],
+            power=config['audio']['power'],
+            preemphasis=config['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -2,7 +2,7 @@ import math
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
-from parakeet.models.transformerTTS.post_convnet import PostConvNet
+from parakeet.models.transformer_tts.post_convnet import PostConvNet
 from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
 from parakeet.models.fastspeech.encoder import Encoder
 from parakeet.models.fastspeech.decoder import Decoder
@@ -13,43 +13,43 @@ class FastSpeech(dg.Layer):
         super(FastSpeech, self).__init__()
 
         self.encoder = Encoder(n_src_vocab=len(symbols)+1,
-                               len_max_seq=cfg.max_sep_len,
-                               n_layers=cfg.encoder_n_layer,
-                               n_head=cfg.encoder_head,
-                               d_k=cfg.fs_hidden_size // cfg.encoder_head,
-                               d_v=cfg.fs_hidden_size // cfg.encoder_head,
-                               d_model=cfg.fs_hidden_size,
-                               d_inner=cfg.encoder_conv1d_filter_size,
-                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
-                               fft_conv1d_padding=cfg.fft_conv1d_padding,
+                               len_max_seq=cfg['max_seq_len'],
+                               n_layers=cfg['encoder_n_layer'],
+                               n_head=cfg['encoder_head'],
+                               d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
+                               d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
+                               d_model=cfg['fs_hidden_size'],
+                               d_inner=cfg['encoder_conv1d_filter_size'],
+                               fft_conv1d_kernel=cfg['fft_conv1d_filter'],
+                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
                                dropout=0.1)
-        self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
-                                                out_channels=cfg.duration_predictor_output_size,
-                                                filter_size=cfg.duration_predictor_filter_size,
-                                                dropout=cfg.dropout)
-        self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
-                               n_layers=cfg.decoder_n_layer,
-                               n_head=cfg.decoder_head,
-                               d_k=cfg.fs_hidden_size // cfg.decoder_head,
-                               d_v=cfg.fs_hidden_size // cfg.decoder_head,
-                               d_model=cfg.fs_hidden_size,
-                               d_inner=cfg.decoder_conv1d_filter_size,
-                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
-                               fft_conv1d_padding=cfg.fft_conv1d_padding,
+        self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
+                                                out_channels=cfg['duration_predictor_output_size'],
+                                                filter_size=cfg['duration_predictor_filter_size'],
+                                                dropout=cfg['dropout'])
+        self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
+                               n_layers=cfg['decoder_n_layer'],
+                               n_head=cfg['decoder_head'],
+                               d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
+                               d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
+                               d_model=cfg['fs_hidden_size'],
+                               d_inner=cfg['decoder_conv1d_filter_size'],
+                               fft_conv1d_kernel=cfg['fft_conv1d_filter'],
+                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
                                dropout=0.1)
         self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
-        k = math.sqrt(1 / cfg.fs_hidden_size)
+        k = math.sqrt(1 / cfg['fs_hidden_size'])
         self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
-        self.mel_linear = dg.Linear(cfg.fs_hidden_size,
-                                    cfg.audio.num_mels * cfg.audio.outputs_per_step,
+        self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
+                                    cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
                                     param_attr = self.weight,
                                     bias_attr = self.bias,)
-        self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
+        self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
                                    num_hidden=512,
                                    filter_size=5,
                                    padding=int(5 / 2),
                                    num_conv=5,
-                                   outputs_per_step=cfg.audio.outputs_per_step,
+                                   outputs_per_step=cfg['audio']['outputs_per_step'],
                                    use_cudnn=True,
                                    dropout=0.1,
                                    batchnorm_last=True)
@@ -4,8 +4,8 @@ import paddle.fluid as fluid
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformerTTS.prenet import PreNet
-from parakeet.models.transformerTTS.post_convnet import PostConvNet
+from parakeet.models.transformer_tts.prenet import PreNet
+from parakeet.models.transformer_tts.post_convnet import PostConvNet
 
 class Decoder(dg.Layer):
     def __init__(self, num_hidden, config, num_head=4):
@@ -20,7 +20,7 @@ class Decoder(dg.Layer):
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                 trainable=False))
-        self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
+        self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'],
                                      hidden_size = num_hidden * 2,
                                      output_size = num_hidden,
                                      dropout_rate=0.2)
@@ -38,17 +38,17 @@ class Decoder(dg.Layer):
         self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step,
+        self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
                                     param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
                                     bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
         self.stop_linear = dg.Linear(num_hidden, 1,
                                      param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
                                      bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
 
-        self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
+        self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
                                        filter_size = 5, padding = 4, num_conv=5,
-                                       outputs_per_step=config.audio.outputs_per_step,
-                                       use_cudnn = config.use_gpu)
+                                       outputs_per_step=config['audio']['outputs_per_step'],
+                                       use_cudnn = True)
 
     def forward(self, key, value, query, c_mask, positional):
@@ -3,10 +3,10 @@ import paddle.fluid as fluid
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet
+from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
 
 class Encoder(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, config, num_head=4):
+    def __init__(self, embedding_size, num_hidden, num_head=4):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
@@ -19,11 +19,11 @@ class Encoder(dg.Layer):
                 trainable=False))
         self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
                                             num_hidden = num_hidden,
-                                            use_cudnn=config.use_gpu)
+                                            use_cudnn=True)
         self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
@@ -1,13 +1,13 @@
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
-from parakeet.models.transformerTTS.encoder import Encoder
-from parakeet.models.transformerTTS.decoder import Decoder
+from parakeet.models.transformer_tts.encoder import Encoder
+from parakeet.models.transformer_tts.decoder import Decoder
 
 class TransformerTTS(dg.Layer):
     def __init__(self, config):
         super(TransformerTTS, self).__init__()
-        self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
-        self.decoder = Decoder(config.hidden_size, config)
+        self.encoder = Encoder(config['embedding_size'], config['hidden_size'])
+        self.decoder = Decoder(config['hidden_size'], config)
         self.config = config
 
     def forward(self, characters, mel_input, pos_text, pos_mel):
@@ -2,20 +2,20 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.modules.customized import Conv1D
 from parakeet.modules.utils import *
-from parakeet.models.transformerTTS.CBHG import CBHG
+from parakeet.models.transformer_tts.CBHG import CBHG
 
 class Vocoder(dg.Layer):
     """
    CBHG Network (mel -> linear)
    """
-    def __init__(self, config):
+    def __init__(self, config, batch_size):
         super(Vocoder, self).__init__()
-        self.pre_proj = Conv1D(num_channels = config.audio.num_mels,
-                               num_filters = config.hidden_size,
+        self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'],
+                               num_filters = config['hidden_size'],
                                filter_size=1)
-        self.cbhg = CBHG(config.hidden_size, config.batch_size)
-        self.post_proj = Conv1D(num_channels = config.hidden_size,
-                               num_filters = (config.audio.n_fft // 2) + 1,
+        self.cbhg = CBHG(config['hidden_size'], batch_size)
+        self.post_proj = Conv1D(num_channels = config['hidden_size'],
+                               num_filters = (config['audio']['n_fft'] // 2) + 1,
                                filter_size=1)
 
     def forward(self, mel):
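Note: Vocoder's batch size becomes an explicit constructor argument instead of being read from config.batch_size, matching the move of batch_size from the yaml into the CLI. Call sites now construct it as:

    model_postnet = Vocoder(cfg, args.batch_size)  # cfg: yaml dict, args: argparse Namespace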