From 0327874f19dd5646d61d4e4ecd73556fd7aeaaa1 Mon Sep 17 00:00:00 2001
From: lfchener
Date: Fri, 18 Dec 2020 19:59:34 +0800
Subject: [PATCH] add example for tacotron2

---
 examples/tacotron2/config.py     |  70 ++++++++++
 examples/tacotron2/ljspeech.py   | 106 ++++++++++++++++
 examples/tacotron2/preprocess.py |  99 +++++++++++++++
 examples/tacotron2/synthesize.py |  89 +++++++++++++
 examples/tacotron2/train.py      | 212 +++++++++++++++++++++++++++++++
 5 files changed, 576 insertions(+)
 create mode 100644 examples/tacotron2/config.py
 create mode 100644 examples/tacotron2/ljspeech.py
 create mode 100644 examples/tacotron2/preprocess.py
 create mode 100644 examples/tacotron2/synthesize.py
 create mode 100644 examples/tacotron2/train.py

diff --git a/examples/tacotron2/config.py b/examples/tacotron2/config.py
new file mode 100644
index 0000000..b14dbf9
--- /dev/null
+++ b/examples/tacotron2/config.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from yacs.config import CfgNode as CN
+
+_C = CN()
+_C.data = CN(
+    dict(
+        batch_size=32,  # batch size
+        valid_size=64,  # the first N examples are reserved for validation
+        sample_rate=22050,  # Hz, sample rate
+        n_fft=1024,  # fft frame size
+        win_length=1024,  # window size
+        hop_length=256,  # hop size between adjacent frames
+        f_max=8000,  # Hz, max frequency when converting to mel
+        f_min=0,  # Hz, min frequency when converting to mel
+        d_mels=80,  # number of mel bands
+        padding_idx=0,  # text embedding's padding index
+    ))
+
+_C.model = CN(
+    dict(
+        reduction_factor=1,  # reduction factor
+        d_encoder=512,  # embedding & encoder's internal size
+        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
+        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
+        d_prenet=256,  # hidden size of decoder prenet
+        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
+        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
+        d_attention=128,  # hidden size of decoder location linear layer
+        attention_filters=32,  # number of filters in decoder location conv layer
+        attention_kernel_size=31,  # kernel size of decoder location conv layer
+        d_postnet=512,  # hidden size of decoder postnet
+        postnet_kernel_size=5,  # kernel size of conv layers in postnet
+        postnet_conv_layers=5,  # number of conv layers in decoder postnet
+        p_encoder_dropout=0.5,  # dropout probability in encoder
+        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
+        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
+        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
+        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
+    ))
+
+_C.training = CN(
+    dict(
+        lr=1e-3,  # learning rate
+        weight_decay=1e-6,  # weight decay coefficient
+        grad_clip_thresh=1.0,  # global norm threshold for gradient clipping
+        plot_interval=1000,  # interval (in steps) between plotting attention and spectrograms
+        valid_interval=1000,  # validation interval (in steps)
+        save_interval=1000,  # checkpoint saving interval (in steps)
+        max_iteration=500000,  # maximum number of iterations to train
+    ))
+
+
+def get_cfg_defaults():
+    """Get a yacs CfgNode object with default values for this example."""
+    # Return a clone so that the defaults will not be altered
+    # This is for the "local variable" use pattern
+    return _C.clone()
diff --git a/examples/tacotron2/ljspeech.py b/examples/tacotron2/ljspeech.py
new file mode 100644
index 0000000..dcc826b
--- /dev/null
+++ b/examples/tacotron2/ljspeech.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+import pickle
+import numpy as np
+from paddle.io import Dataset, DataLoader
+
+from parakeet.data.batch import batch_spec, batch_text_id
+from parakeet.data import dataset
+
+
+class LJSpeech(Dataset):
+    """A simple dataset adaptor for the processed ljspeech dataset."""
+
+    def __init__(self, root):
+        self.root = Path(root).expanduser()
+        records = []
+        with open(self.root / "metadata.pkl", 'rb') as f:
+            metadata = pickle.load(f)
+        for mel_name, text, ids in metadata:
+            mel_name = self.root / "mel" / (mel_name + ".npy")
+            records.append((mel_name, text, ids))
+        self.records = records
+
+    def __getitem__(self, i):
+        mel_name, _, ids = self.records[i]
+        mel = np.load(mel_name)
+        return ids, mel
+
+    def __len__(self):
+        return len(self.records)
+
+
+class LJSpeechCollector(object):
+    """A simple callable to batch LJSpeech examples."""
+
+    def __init__(self, padding_idx=0, padding_value=0.,
+                 padding_stop_token=1.0):
+        self.padding_idx = padding_idx
+        self.padding_value = padding_value
+        self.padding_stop_token = padding_stop_token
+
+    def __call__(self, examples):
+        texts = []
+        mels = []
+        text_lens = []
+        mel_lens = []
+        stop_tokens = []
+        for data in examples:
+            text, mel = data
+            text = np.array(text, dtype=np.int64)
+            text_lens.append(len(text))
+            mels.append(mel)
+            texts.append(text)
+            mel_lens.append(mel.shape[1])
+            stop_token = np.zeros([mel.shape[1] - 1], dtype=np.float32)
+            stop_tokens.append(np.append(stop_token, 1.0))
+
+        # Sort texts, mels, mel_lens and stop_tokens by text_len in descending order
+        texts = [
+            i
+            for i, _ in sorted(
+                zip(texts, text_lens), key=lambda x: x[1], reverse=True)
+        ]
+        mels = [
+            i
+            for i, _ in sorted(
+                zip(mels, text_lens), key=lambda x: x[1], reverse=True)
+        ]
+
+        mel_lens = [
+            i
+            for i, _ in sorted(
+                zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
+        ]
+
+        stop_tokens = [
+            i
+            for i, _ in sorted(
+                zip(stop_tokens, text_lens), key=lambda x: x[1], reverse=True)
+        ]
+
+        text_lens = sorted(text_lens, reverse=True)
+
+        # Pad each sequence to the longest length in the batch
+        texts = batch_text_id(texts, pad_id=self.padding_idx)
+        mels = np.transpose(
+            batch_spec(
+                mels, pad_value=self.padding_value), axes=(0, 2, 1))
+        stop_tokens = batch_text_id(
+            stop_tokens, pad_id=self.padding_stop_token, dtype=mels[0].dtype)
+
+        return (texts, mels, text_lens, mel_lens, stop_tokens)
diff --git a/examples/tacotron2/preprocess.py b/examples/tacotron2/preprocess.py
new file mode 100644
index 0000000..b99c126
--- /dev/null
+++ b/examples/tacotron2/preprocess.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tqdm
+import pickle
+import argparse
+import numpy as np
+from pathlib import Path
+
+from parakeet.datasets import LJSpeechMetaData
+from parakeet.audio import AudioProcessor, LogMagnitude
+from parakeet.frontend import EnglishCharacter
+
+from config import get_cfg_defaults
+
+
+def create_dataset(config, source_path, target_path, verbose=False):
+    # create output dir
+    target_path = Path(target_path).expanduser()
+    mel_path = target_path / "mel"
+    os.makedirs(mel_path, exist_ok=True)
+
+    meta_data = LJSpeechMetaData(source_path)
+    frontend = EnglishCharacter()
+    processor = AudioProcessor(
+        sample_rate=config.data.sample_rate,
+        n_fft=config.data.n_fft,
+        n_mels=config.data.d_mels,
+        win_length=config.data.win_length,
+        hop_length=config.data.hop_length,
+        f_max=config.data.f_max,
+        f_min=config.data.f_min)
+    normalizer = LogMagnitude()
+
+    records = []
+    for (fname, text, _) in tqdm.tqdm(meta_data):
+        wav = processor.read_wav(fname)
+        mel = processor.mel_spectrogram(wav)
+        mel = normalizer.transform(mel)
+        ids = frontend(text)
+        mel_name = os.path.splitext(os.path.basename(fname))[0]
+
+        # save mel spectrogram
+        records.append((mel_name, text, ids))
+        np.save(mel_path / mel_name, mel)
+    if verbose:
+        print("saved mel spectrograms into {}".format(mel_path))
+
+    # save meta data as pickle archive
+    with open(target_path / "metadata.pkl", 'wb') as f:
+        pickle.dump(records, f)
+    if verbose:
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.pkl"))
+
+    print("Done.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="create dataset")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to override the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save the output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to override the --config file and the default config, passed in as KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print verbose messages")
+
+    config = get_cfg_defaults()
+    args = parser.parse_args()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config.data)
+
+    create_dataset(config, args.input, args.output, args.verbose)
diff --git a/examples/tacotron2/synthesize.py b/examples/tacotron2/synthesize.py
new file mode 100644
index 0000000..278e35f
--- /dev/null
+++ b/examples/tacotron2/synthesize.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from pathlib import Path
+import numpy as np
+
+import paddle
+import parakeet
+from parakeet.frontend import EnglishCharacter
+from parakeet.models.tacotron2 import Tacotron2
+
+from config import get_cfg_defaults
+
+
+def main(config, args):
+    paddle.set_device(args.device)
+
+    # model
+    frontend = EnglishCharacter()
+    model = Tacotron2.from_pretrained(frontend, config, args.checkpoint_path)
+    model.eval()
+
+    # inputs
+    input_path = Path(args.input).expanduser()
+    with open(input_path, "rt") as f:
+        sentences = f.readlines()
+
+    if args.output is None:
+        output_dir = input_path.parent / "synthesis"
+    else:
+        output_dir = Path(args.output).expanduser()
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for i, sentence in enumerate(sentences):
+        mel_output, _ = model.predict(sentence)
+        mel_output = mel_output.T
+
+        np.save(str(output_dir / f"sentence_{i}"), mel_output)
+        if args.verbose:
+            print("spectrogram saved at {}".format(output_dir /
+                                                   f"sentence_{i}.npy"))
+
+
+if __name__ == "__main__":
+    config = get_cfg_defaults()
+
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrograms with Tacotron2.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to override the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument("--input", type=str, help="path of the text file of sentences to synthesize")
+    parser.add_argument("--output", type=str, help="path to save outputs")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to override the --config file and the default config, passed in as KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print verbose messages")
+
+    args = parser.parse_args()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config)
+    print(args)
+
+    main(config, args)
diff --git a/examples/tacotron2/train.py b/examples/tacotron2/train.py
new file mode 100644
index 0000000..bd635e6
--- /dev/null
+++ b/examples/tacotron2/train.py
@@ -0,0 +1,212 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from collections import defaultdict
+import numpy as np
+
+import paddle
+from paddle import distributed as dist
+from paddle.io import DataLoader, DistributedBatchSampler
+
+import parakeet
+from parakeet.data import dataset
+from parakeet.frontend import EnglishCharacter
+from parakeet.training.cli import default_argument_parser
+from parakeet.training.experiment import ExperimentBase
+from parakeet.utils import display, mp_tools
+from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
+
+from config import get_cfg_defaults
+from ljspeech import LJSpeech, LJSpeechCollector
+
+
+class Experiment(ExperimentBase):
+    def compute_losses(self, inputs, outputs):
+        _, mel_targets, _, _, stop_tokens = inputs
+
+        mel_outputs = outputs["mel_output"]
+        mel_outputs_postnet = outputs["mel_outputs_postnet"]
+        stop_logits = outputs["stop_logits"]
+
+        losses = self.criterion(mel_outputs, mel_outputs_postnet, stop_logits,
+                                mel_targets, stop_tokens)
+        return losses
+
+    def train_batch(self):
+        start = time.time()
+        batch = self.read_batch()
+        data_loader_time = time.time() - start
+
+        self.optimizer.clear_grad()
+        self.model.train()
+        texts, mels, text_lens, output_lens, stop_tokens = batch
+        outputs = self.model(texts, mels, text_lens, output_lens)
+        losses = self.compute_losses(batch, outputs)
+        loss = losses["loss"]
+        loss.backward()
+        self.optimizer.step()
+        iteration_time = time.time() - start
+
+        losses_np = {k: float(v) for k, v in losses.items()}
+        # logging
+        msg = "Rank: {}, ".format(dist.get_rank())
+        msg += "step: {}, ".format(self.iteration)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in losses_np.items())
+        self.logger.info(msg)
+
+        if dist.get_rank() == 0:
+            for k, v in losses_np.items():
+                self.visualizer.add_scalar(f"train_loss/{k}", v,
+                                           self.iteration)
+
+    @mp_tools.rank_zero_only
+    @paddle.no_grad()
+    def valid(self):
+        self.model.eval()  # disable dropout for validation
+        valid_losses = defaultdict(list)
+        for i, batch in enumerate(self.valid_loader):
+            texts, mels, text_lens, output_lens, stop_tokens = batch
+            outputs = self.model(texts, mels, text_lens, output_lens)
+            losses = self.compute_losses(batch, outputs)
+            for k, v in losses.items():
+                valid_losses[k].append(float(v))
+
+            attention_weights = outputs["alignments"]
+            display.add_attention_plots(self.visualizer,
+                                        f"valid_sentence_{i}_alignments",
+                                        attention_weights[0], self.iteration)
+            display.add_spectrogram_plots(
+                self.visualizer, f"valid_sentence_{i}_target_spectrogram",
+                mels[0], self.iteration)
+            display.add_spectrogram_plots(
+                self.visualizer, f"valid_sentence_{i}_predicted_spectrogram",
+                outputs['mel_outputs_postnet'][0], self.iteration)
+
+        # average losses over the validation set
+        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
+
+        # logging
+        msg = "Valid: "
+        msg += "step: {}, ".format(self.iteration)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in valid_losses.items())
+        self.logger.info(msg)
+
+        # write visual log
+        for k, v in valid_losses.items():
+            self.visualizer.add_scalar(f"valid/{k}", v, self.iteration)
+
+    def setup_model(self):
+        config = self.config
+        frontend = EnglishCharacter()
+        model = Tacotron2(
+            frontend,
+            d_mels=config.data.d_mels,
+            d_encoder=config.model.d_encoder,
+            encoder_conv_layers=config.model.encoder_conv_layers,
+            encoder_kernel_size=config.model.encoder_kernel_size,
+            d_prenet=config.model.d_prenet,
+            d_attention_rnn=config.model.d_attention_rnn,
+            d_decoder_rnn=config.model.d_decoder_rnn,
attention_filters=config.model.attention_filters, + attention_kernel_size=config.model.attention_kernel_size, + d_attention=config.model.d_attention, + d_postnet=config.model.d_postnet, + postnet_kernel_size=config.model.postnet_kernel_size, + postnet_conv_layers=config.model.postnet_conv_layers, + reduction_factor=config.model.reduction_factor, + p_encoder_dropout=config.model.p_encoder_dropout, + p_prenet_dropout=config.model.p_prenet_dropout, + p_attention_dropout=config.model.p_attention_dropout, + p_decoder_dropout=config.model.p_decoder_dropout, + p_postnet_dropout=config.model.p_postnet_dropout) + + if self.parallel: + model = paddle.DataParallel(model) + + grad_clip = paddle.nn.ClipGradByGlobalNorm( + config.training.grad_clip_thresh) + optimizer = paddle.optimizer.Adam( + learning_rate=config.training.lr, + parameters=model.parameters(), + weight_decay=paddle.regularizer.L2Decay( + config.training.weight_decay), + grad_clip=grad_clip) + criterion = Tacotron2Loss() + self.model = model + self.optimizer = optimizer + self.criterion = criterion + + def setup_dataloader(self): + args = self.args + config = self.config + ljspeech_dataset = LJSpeech(args.data) + + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) + batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx) + + if not self.parallel: + self.train_loader = DataLoader( + train_set, + batch_size=config.data.batch_size, + shuffle=True, + drop_last=True, + collate_fn=batch_fn) + else: + sampler = DistributedBatchSampler( + train_set, + batch_size=config.data.batch_size, + shuffle=True, + drop_last=True) + self.train_loader = DataLoader( + train_set, batch_sampler=sampler, collate_fn=batch_fn) + + self.valid_loader = DataLoader( + valid_set, + batch_size=config.data.batch_size, + shuffle=False, + drop_last=False, + collate_fn=batch_fn) + + +def main_sp(config, args): + exp = Experiment(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.nprocs > 1 and args.device == "gpu": + dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + else: + main_sp(config, args) + + +if __name__ == "__main__": + config = get_cfg_defaults() + parser = default_argument_parser() + args = parser.parse_args() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + print(args) + + main(config, args)
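
The three scripts above share the yacs configuration pattern defined in config.py: start from get_cfg_defaults(), optionally merge a YAML file and KEY VALUE pairs, then freeze. A minimal sketch of the same flow done programmatically (the override keys below are illustrative; get_cfg_defaults, merge_from_list and freeze are the calls the scripts themselves make):

    from config import get_cfg_defaults

    config = get_cfg_defaults()  # a clone, so the module-level defaults stay intact
    # equivalent to passing `--opts data.batch_size 16 training.lr 5e-4`
    # on the command line of preprocess.py / train.py / synthesize.py
    config.merge_from_list(["data.batch_size", "16", "training.lr", "5e-4"])
    config.freeze()  # make the config read-only from here on
    print(config.data.batch_size)  # -> 16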
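
For a single utterance, preprocess.py reduces to two transforms: waveform -> log-magnitude mel spectrogram, and text -> character ids. A sketch using the same parakeet calls as create_dataset (the wav path and the sentence are placeholders, not files shipped with this patch):

    from config import get_cfg_defaults
    from parakeet.audio import AudioProcessor, LogMagnitude
    from parakeet.frontend import EnglishCharacter

    config = get_cfg_defaults()
    processor = AudioProcessor(
        sample_rate=config.data.sample_rate,  # 22050 Hz for LJSpeech
        n_fft=config.data.n_fft,
        n_mels=config.data.d_mels,
        win_length=config.data.win_length,
        hop_length=config.data.hop_length,
        f_max=config.data.f_max,
        f_min=config.data.f_min)
    normalizer = LogMagnitude()
    frontend = EnglishCharacter()

    wav = processor.read_wav("LJ001-0001.wav")  # placeholder LJSpeech file
    mel = normalizer.transform(processor.mel_spectrogram(wav))  # (d_mels, T)
    ids = frontend("A sentence to convert to character ids.")

The full pipeline is then preprocess.py run once over the LJSpeech download, train.py over the preprocessed directory, and synthesize.py with a trained checkpoint, each accepting the same --config/--opts overrides.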
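
LJSpeechCollector can be sanity-checked in isolation by batching a couple of toy examples and inspecting the shapes. The ids and 80-band spectrograms below are dummy data, and the shape comments assume parakeet's batch_text_id and batch_spec pad along the time axis, as they are used in ljspeech.py:

    import numpy as np
    from ljspeech import LJSpeechCollector

    batch_fn = LJSpeechCollector(padding_idx=0)
    examples = [
        (np.array([4, 7, 9], dtype=np.int64),
         np.random.randn(80, 50).astype(np.float32)),  # (d_mels, T) = (80, 50)
        (np.array([5, 2], dtype=np.int64),
         np.random.randn(80, 30).astype(np.float32)),
    ]
    texts, mels, text_lens, mel_lens, stop_tokens = batch_fn(examples)
    print(texts.shape)  # (2, 3): ids padded to the longest text
    print(mels.shape)   # (2, 50, 80): padded, then transposed to (B, T, d_mels)
    print(text_lens)    # [3, 2]: sorted in descending order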