From e03e96d9e4c2b91eb3f4128b1c3a241bc08b72d3 Mon Sep 17 00:00:00 2001 From: iclementine Date: Sun, 20 Dec 2020 13:15:07 +0800 Subject: [PATCH] format all the code with yapf --- README_cn.md | 2 +- doc/source/conf.py | 19 +- docs/config_cn.md | 10 +- docs/data_cn.md | 14 +- docs/experiment_cn.md | 2 +- docs/experiment_guide_cn.md | 2 - docs/installation_cn.md | 2 +- docs/overview_cn.md | 7 - examples/transformer_tts/config.py | 56 ++-- examples/transformer_tts/ljspeech.py | 37 ++- examples/transformer_tts/preprocess.py | 50 +++- examples/transformer_tts/synthesize.py | 55 +++- examples/transformer_tts/train.py | 74 +++--- examples/waveflow/config.py | 58 +++-- examples/waveflow/ljspeech.py | 36 ++- examples/waveflow/preprocess.py | 52 ++-- examples/waveflow/synthesize.py | 54 +++- examples/waveflow/train.py | 48 +++- examples/wavenet/config.py | 55 ++-- examples/wavenet/ljspeech.py | 37 ++- examples/wavenet/preprocess.py | 54 ++-- examples/wavenet/synthesize.py | 53 +++- examples/wavenet/train.py | 55 ++-- parakeet/audio/audio.py | 34 +-- parakeet/audio/spec_normalizer.py | 26 +- parakeet/data/batch.py | 30 ++- parakeet/data/dataset.py | 14 +- parakeet/datasets/__init__.py | 14 + parakeet/datasets/common.py | 17 +- parakeet/datasets/ljspeech.py | 16 +- parakeet/frontend/__init__.py | 14 + parakeet/frontend/normalizer/__init__.py | 14 + parakeet/frontend/normalizer/abbrrviation.py | 14 + parakeet/frontend/normalizer/acronyms.py | 14 + parakeet/frontend/normalizer/width.py | 25 +- parakeet/frontend/punctuation.py | 26 +- parakeet/models/transformer_tts.py | 7 +- parakeet/models/waveflow.py | 257 +++++++++++-------- parakeet/models/wavenet.py | 135 +++++----- parakeet/modules/audio.py | 20 +- parakeet/modules/conv.py | 2 + parakeet/modules/geometry.py | 17 +- parakeet/modules/losses.py | 36 ++- parakeet/modules/masking.py | 15 ++ parakeet/modules/positional_encoding.py | 17 +- parakeet/modules/transformer.py | 74 +++--- parakeet/training/__init__.py | 14 + parakeet/training/cli.py | 17 +- parakeet/training/default_config.py | 26 +- parakeet/training/experiment.py | 1 + parakeet/utils/checkpoint.py | 15 +- parakeet/utils/internals.py | 14 + parakeet/utils/layer_tools.py | 4 + parakeet/utils/mp_tools.py | 21 +- parakeet/utils/scheduler.py | 25 +- setup.py | 14 +- 56 files changed, 1252 insertions(+), 569 deletions(-) diff --git a/README_cn.md b/README_cn.md index 994a4e2..ce88032 100644 --- a/README_cn.md +++ b/README_cn.md @@ -228,6 +228,6 @@ Parakeet 同时提供了示例模型的训练好的参数,可从下表中获 正在开发中。 -## 版权和许可 +## 版权和许可 Parakeet 以 [Apache-2.0 license](LICENSE) 提供。 diff --git a/doc/source/conf.py b/doc/source/conf.py index f7d0af2..dd4a270 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -14,7 +28,6 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) - # -- Project information ----------------------------------------------------- project = 'parakeet' @@ -24,7 +37,6 @@ author = 'parakeet-developers' # The full version, including alpha/beta/rc tags release = '0.2' - # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be @@ -33,7 +45,7 @@ release = '0.2' extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', - "sphinx_rtd_theme", + "sphinx_rtd_theme", 'sphinx.ext.mathjax', 'numpydoc', ] @@ -46,7 +58,6 @@ templates_path = ['_templates'] # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/config_cn.md b/docs/config_cn.md index 2b8ce4c..29a80c6 100644 --- a/docs/config_cn.md +++ b/docs/config_cn.md @@ -18,7 +18,7 @@ 常见的配置文件的格式有 `ini`, `yaml`, `toml`, `json` 等。 -`ini` +`ini` 优点:简单,支持字符串插值等操作。 缺点:仅支持两层结构,值不带类型信息,解析的时候需要手动 cast。 @@ -102,11 +102,3 @@ optional arguments: --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs ``` - - - - - - - - diff --git a/docs/data_cn.md b/docs/data_cn.md index 4a7aab8..6ef6404 100644 --- a/docs/data_cn.md +++ b/docs/data_cn.md @@ -21,7 +21,7 @@ 一般来说,我们将一个 Dataset 的子类看作是数据集和实验的具体需求之间的适配器。 -parakeet 还提供了若干个高阶的 Dataset 类,用于从已有的 Dataset 产生新的 Dataset. +parakeet 还提供了若干个高阶的 Dataset 类,用于从已有的 Dataset 产生新的 Dataset. 1. 用于字段组合的有 TupleDataset, DictDataset; 2. 用于数据集切分合并的有 SliceDataset, SubsetDataset, ChainDataset; @@ -137,7 +137,7 @@ class Transform(object): self.processor = AudioProcessor( sample_rate=22050, n_fft=1024, - win_length=1024, + win_length=1024, hop_length=256, f_max=8000) self.normalizer = LogMagnitude() @@ -167,7 +167,7 @@ ljspeech = TransformDataset(meta, transform) 当然也可以选择专门写一个转换脚本把转换后的数据集保存下来,然后再写一个适配的 Dataset 子类去加载这些保存的数据。实际这么做的效率会更高。 -接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据,所以我们需要进行 padding. +接下来我们需要写一个可调用对象将多个样例组成批次。因为其中的 ids 和 mel 频谱是序列数据,所以我们需要进行 padding. ```python class LJSpeechCollector(object): @@ -197,10 +197,10 @@ def create_dataloader(source_path, valid_size, batch_size): valid_set, train_set = dataset.split(lj, valid_size) train_loader = DataLoader( - train_set, - return_list=False, - batch_size=batch_size, - shuffle=True, + train_set, + return_list=False, + batch_size=batch_size, + shuffle=True, drop_last=True, collate_fn=LJSpeechCollector()) valid_loader = DataLoader( diff --git a/docs/experiment_cn.md b/docs/experiment_cn.md index dc6a997..0596dda 100644 --- a/docs/experiment_cn.md +++ b/docs/experiment_cn.md @@ -72,4 +72,4 @@ def train(self): ```python exp.run() -``` \ No newline at end of file +``` diff --git a/docs/experiment_guide_cn.md b/docs/experiment_guide_cn.md index c5cc82e..8c9b89d 100644 --- a/docs/experiment_guide_cn.md +++ b/docs/experiment_guide_cn.md @@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+ ``` 在这个软件源中包含了几个例子,可以在 [Parakeet/examples](../examples) 中查看。这些实验被作为样例提供给用户,可以直接运行。同时也欢迎用户添加新的模型和实验并为 `Parakeet` 贡献代码。 - - diff --git a/docs/installation_cn.md b/docs/installation_cn.md index a861c86..030b721 100644 --- a/docs/installation_cn.md +++ b/docs/installation_cn.md @@ -31,7 +31,7 @@ python -m pip install paddlepaddle==2.0.0rc0 -i https://mirror.baidu.com/pypi/si # ubuntu, debian sudo apt-get install libsndfile1 -# centos, fedora, +# centos, fedora, sudo yum install libsndfile # openSUSE diff --git a/docs/overview_cn.md b/docs/overview_cn.md index 40659af..06a9f93 100644 --- a/docs/overview_cn.md +++ b/docs/overview_cn.md @@ -9,10 +9,3 @@ Parakeet 为用户和开发者提供了 1. 可复用的模型以及常用的模块; 2. 从数据处理,模型训练到预测等一系列过程的完整实验; 3. 高质量的开箱即用模型。 - - - - - - - diff --git a/examples/transformer_tts/config.py b/examples/transformer_tts/config.py index fef9ed8..bcf8e90 100644 --- a/examples/transformer_tts/config.py +++ b/examples/transformer_tts/config.py @@ -1,21 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from yacs.config import CfgNode as CN _C = CN() _C.data = CN( dict( - batch_size=16, # batch size - valid_size=64, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=1024, # fft frame size - win_length=1024, # window size + batch_size=16, # batch size + valid_size=64, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=1024, # fft frame size + win_length=1024, # window size hop_length=256, # hop size between ajacent frame - f_max=8000, # Hz, max frequency when converting to mel + f_max=8000, # Hz, max frequency when converting to mel d_mel=80, # mel bands - padding_idx=0, # text embedding's padding index - mel_start_value=0.5, # value for starting frame - mel_end_value=-0.5, # # value for ending frame - ) -) + padding_idx=0, # text embedding's padding index + mel_start_value=0.5, # value for starting frame + mel_end_value=-0.5, # # value for ending frame + )) _C.model = CN( dict( @@ -31,22 +44,21 @@ _C.model = CN( postnet_kernel_size=5, # decoder postnet(cnn)'s kernel size max_reduction_factor=10, # max_reduction factor dropout=0.1, # global droput probability - stop_loss_scale=8.0, # scaler for stop _loss - decoder_prenet_dropout=0.5, # decoder prenet dropout probability - ) -) + stop_loss_scale=8.0, # scaler for stop _loss + decoder_prenet_dropout=0.5, # decoder prenet dropout probability + )) _C.training = CN( dict( - lr=1e-4, # learning rate + lr=1e-4, # learning rate drop_n_heads=[[0, 0], [15000, 1]], reduction_factor=[[0, 10], [80000, 4], [200000, 2]], - plot_interval=1000, # plot attention and spectrogram - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=900000, # max iteration to train - ) -) + plot_interval=1000, # plot attention and spectrogram + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=900000, # max iteration to train + )) + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/examples/transformer_tts/ljspeech.py b/examples/transformer_tts/ljspeech.py index 245b475..137db96 100644 --- a/examples/transformer_tts/ljspeech.py +++ b/examples/transformer_tts/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path import pickle @@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader from parakeet.data.batch import batch_spec, batch_text_id from parakeet.data import dataset + class LJSpeech(Dataset): """A simple dataset adaptor for the processed ljspeech dataset.""" + def __init__(self, root): self.root = Path(root).expanduser() records = [] @@ -35,13 +51,13 @@ class Transform(object): self.end_value = end_value def __call__(self, example): - ids, mel = example # ids already have and + ids, mel = example # ids already have and ids = np.array(ids, dtype=np.int64) # add start and end frame - mel = np.pad(mel, - [(0, 0), (1, 1)], - mode='constant', - constant_values=[(0, 0), (self.start_value, self.end_value)]) + mel = np.pad( + mel, [(0, 0), (1, 1)], + mode='constant', + constant_values=[(0, 0), (self.start_value, self.end_value)]) stop_labels = np.ones([mel.shape[1]], dtype=np.int64) stop_labels[-1] = 2 # actually this thing can also be done within the model @@ -50,6 +66,7 @@ class Transform(object): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" + def __init__(self, padding_idx=0, padding_value=0.): self.padding_idx = padding_idx self.padding_value = padding_value @@ -67,15 +84,16 @@ class LJSpeechCollector(object): def create_dataloader(config, source_path): lj = LJSpeech(source_path) - transform = Transform(config.data.mel_start_value, config.data.mel_end_value) + transform = Transform(config.data.mel_start_value, + config.data.mel_end_value) lj = dataset.TransformDataset(lj, transform) valid_set, train_set = dataset.split(lj, config.data.valid_size) data_collator = LJSpeechCollector(padding_idx=config.data.padding_idx) train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=data_collator) valid_loader = DataLoader( @@ -85,4 +103,3 @@ def create_dataloader(config, source_path): drop_last=False, collate_fn=data_collator) return train_loader, valid_loader - diff --git a/examples/transformer_tts/preprocess.py b/examples/transformer_tts/preprocess.py index 001f04c..2ba1985 100644 --- a/examples/transformer_tts/preprocess.py +++ b/examples/transformer_tts/preprocess.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import tqdm import pickle @@ -11,6 +25,7 @@ from parakeet.frontend import English from config import get_cfg_defaults + def create_dataset(config, source_path, target_path, verbose=False): # create output dir target_path = Path(target_path).expanduser() @@ -23,11 +38,11 @@ def create_dataset(config, source_path, target_path, verbose=False): sample_rate=config.data.sample_rate, n_fft=config.data.n_fft, n_mels=config.data.d_mel, - win_length=config.data.win_length, + win_length=config.data.win_length, hop_length=config.data.hop_length, f_max=config.data.f_max) normalizer = LogMagnitude() - + records = [] for (fname, text, _) in tqdm.tqdm(meta_data): wav = processor.read_wav(fname) @@ -42,12 +57,13 @@ def create_dataset(config, source_path, target_path, verbose=False): np.save(mel_path / mel_name, mel) if verbose: print("save mel spectrograms into {}".format(mel_path)) - + # save meta data as pickle archive with open(target_path / "metadata.pkl", 'wb') as f: pickle.dump(records, f) if verbose: - print("saved metadata into {}".format(target_path / "metadata.pkl")) + print("saved metadata into {}".format(target_path / + "metadata.pkl")) # also save meta data into text format for inspection with open(target_path / "metadata.txt", 'wt') as f: @@ -55,21 +71,31 @@ def create_dataset(config, source_path, target_path, verbose=False): phoneme_str = "|".join(phonemes) f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str)) if verbose: - print("saved metadata into {}".format(target_path / "metadata.txt")) - + print("saved metadata into {}".format(target_path / + "metadata.txt")) + print("Done.") if __name__ == "__main__": parser = argparse.ArgumentParser(description="create dataset") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--input", type=str, help="path of the ljspeech dataset") - parser.add_argument("--output", type=str, help="path to save output dataset") - parser.add_argument("--opts", nargs=argparse.REMAINDER, + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + config = get_cfg_defaults() args = parser.parse_args() if args.config: diff --git a/examples/transformer_tts/synthesize.py b/examples/transformer_tts/synthesize.py index b8f352f..6758819 100644 --- a/examples/transformer_tts/synthesize.py +++ b/examples/transformer_tts/synthesize.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import time from pathlib import Path @@ -13,21 +27,22 @@ from parakeet.utils.display import add_attention_plots from config import get_cfg_defaults + @paddle.fluid.dygraph.no_grad def main(config, args): paddle.set_device(args.device) # model frontend = English() - model = TransformerTTS.from_pretrained( - frontend, config, args.checkpoint_path) + model = TransformerTTS.from_pretrained(frontend, config, + args.checkpoint_path) model.eval() # inputs input_path = Path(args.input).expanduser() - with open(input_path, "rt") as f: + with open(input_path, "rt") as f: sentences = f.readlines() - + output_dir = Path(args.output).expanduser() output_dir.mkdir(parents=True, exist_ok=True) @@ -38,22 +53,36 @@ def main(config, args): mel_output = mel_output.T #(C, T) np.save(str(output_dir / f"sentence_{i}"), mel_output) if args.verbose: - print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy")) + print("spectrogram saved at {}".format(output_dir / + f"sentence_{i}.npy")) + if __name__ == "__main__": config = get_cfg_defaults() - parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") parser.add_argument("--input", type=str, help="path of the text sentences") parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument("--device", type=str, default="cpu", help="device type to use.") - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "--device", type=str, default="cpu", help="device type to use.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/examples/transformer_tts/train.py b/examples/transformer_tts/train.py index 59ec7aa..b5ae11d 100644 --- a/examples/transformer_tts/train.py +++ b/examples/transformer_tts/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time import logging from pathlib import Path @@ -19,12 +33,13 @@ from parakeet.training.experiment import ExperimentBase from config import get_cfg_defaults from ljspeech import LJSpeech, LJSpeechCollector, Transform + class Experiment(ExperimentBase): def setup_model(self): config = self.config frontend = English() model = TransformerTTS( - frontend, + frontend, d_encoder=config.model.d_encoder, d_decoder=config.model.d_decoder, d_mel=config.data.d_mel, @@ -46,8 +61,7 @@ class Experiment(ExperimentBase): beta1=0.9, beta2=0.98, epsilon=1e-9, - parameters=model.parameters() - ) + parameters=model.parameters()) criterion = TransformerTTSLoss(config.model.stop_loss_scale) drop_n_heads = scheduler.StepWise(config.training.drop_n_heads) reduction_factor = scheduler.StepWise(config.training.reduction_factor) @@ -63,21 +77,24 @@ class Experiment(ExperimentBase): config = self.config ljspeech_dataset = LJSpeech(args.data) - transform = Transform(config.data.mel_start_value, config.data.mel_end_value) - ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform) - valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) + transform = Transform(config.data.mel_start_value, + config.data.mel_end_value) + ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, + transform) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx) - + if not self.parallel: train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=batch_fn) else: sampler = DistributedBatchSampler( - train_set, + train_set, batch_size=config.data.batch_size, num_replicas=dist.get_world_size(), rank=dist.get_rank(), @@ -95,11 +112,11 @@ class Experiment(ExperimentBase): def compute_outputs(self, text, mel, stop_label): model_core = self.model._layers if self.parallel else self.model model_core.set_constants( - self.reduction_factor(self.iteration), + self.reduction_factor(self.iteration), self.drop_n_heads(self.iteration)) # TODO(chenfeiyu): we can combine these 2 slices - mel_input = mel[:,:-1, :] + mel_input = mel[:, :-1, :] reduced_mel_input = mel_input[:, ::model_core.r, :] outputs = self.model(text, reduced_mel_input) return outputs @@ -115,11 +132,8 @@ class Experiment(ExperimentBase): time_steps = mel_target.shape[1] losses = self.criterion( - mel_output[:,:time_steps, :], - mel_intermediate[:,:time_steps, :], - mel_target, - stop_logits[:,:time_steps, :], - stop_label_target) + mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :], + mel_target, stop_logits[:, :time_steps, :], stop_label_target) return losses def train_batch(self): @@ -133,7 +147,7 @@ class Experiment(ExperimentBase): outputs = self.compute_outputs(text, mel, stop_label) losses = self.compute_losses(batch, outputs) loss = losses["loss"] - loss.backward() + loss.backward() self.optimizer.step() iteration_time = time.time() - start @@ -141,14 +155,17 @@ class Experiment(ExperimentBase): # logging msg = "Rank: {}, ".format(dist.get_rank()) msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time) - msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_np.items()) self.logger.info(msg) - + if dist.get_rank() == 0: for k, v in losses_np.items(): - self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration) - + self.visualizer.add_scalar(f"train_loss/{k}", v, + self.iteration) + @mp_tools.rank_zero_only @paddle.no_grad() def valid(self): @@ -163,10 +180,9 @@ class Experiment(ExperimentBase): if i < 2: attention_weights = outputs["cross_attention_weights"] display.add_multi_attention_plots( - self.visualizer, - f"valid_sentence_{i}_cross_attention_weights", - attention_weights, - self.iteration) + self.visualizer, + f"valid_sentence_{i}_cross_attention_weights", + attention_weights, self.iteration) # write visual log valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} @@ -191,7 +207,7 @@ if __name__ == "__main__": config = get_cfg_defaults() parser = default_argument_parser() args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/examples/waveflow/config.py b/examples/waveflow/config.py index 97a877a..5ca2ba1 100644 --- a/examples/waveflow/config.py +++ b/examples/waveflow/config.py @@ -1,40 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from yacs.config import CfgNode as CN _C = CN() _C.data = CN( dict( - batch_size=8, # batch size - valid_size=16, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=1024, # fft frame size - win_length=1024, # window size + batch_size=8, # batch size + valid_size=16, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=1024, # fft frame size + win_length=1024, # window size hop_length=256, # hop size between ajacent frame - f_max=8000, # Hz, max frequency when converting to mel + f_max=8000, # Hz, max frequency when converting to mel n_mels=80, # mel bands - clip_frames=65, # mel clip frames - ) -) + clip_frames=65, # mel clip frames + )) _C.model = CN( dict( upsample_factors=[16, 16], - n_flows=8, # number of flows in WaveFlow - n_layers=8, # number of conv block in each flow - n_group=16, # folding factor of audio and spectrogram - channels=128, # resiaudal channel in each flow - kernel_size=[3, 3], # kernel size in each conv block - sigma=1.0, # stddev of the random noise - ) -) + n_flows=8, # number of flows in WaveFlow + n_layers=8, # number of conv block in each flow + n_group=16, # folding factor of audio and spectrogram + channels=128, # resiaudal channel in each flow + kernel_size=[3, 3], # kernel size in each conv block + sigma=1.0, # stddev of the random noise + )) _C.training = CN( dict( - lr=2e-4, # learning rates - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=3000000, # max iteration to train - ) -) + lr=2e-4, # learning rates + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=3000000, # max iteration to train + )) + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/examples/waveflow/ljspeech.py b/examples/waveflow/ljspeech.py index d7f5425..e07303a 100644 --- a/examples/waveflow/ljspeech.py +++ b/examples/waveflow/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path import pickle @@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav from parakeet.data import dataset from parakeet.audio import AudioProcessor + class LJSpeech(Dataset): """A simple dataset adaptor for the processed ljspeech dataset.""" + def __init__(self, root): self.root = Path(root).expanduser() meta_data = pandas.read_csv( str(self.root / "metadata.csv"), sep="\t", header=None, - names=["fname", "frames", "samples"] - ) - + names=["fname", "frames", "samples"]) + records = [] - for row in meta_data.itertuples() : + for row in meta_data.itertuples(): mel_path = str(self.root / "mel" / (row.fname + ".npy")) wav_path = str(self.root / "wav" / (row.fname + ".npy")) records.append((mel_path, wav_path)) @@ -39,6 +54,7 @@ class LJSpeech(Dataset): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" + def __init__(self, padding_value=0.): self.padding_value = padding_value @@ -52,9 +68,9 @@ class LJSpeechCollector(object): class LJSpeechClipCollector(object): def __init__(self, clip_frames=65, hop_length=256): - self.clip_frames = clip_frames + self.clip_frames = clip_frames self.hop_length = hop_length - + def __call__(self, examples): mels = [] wavs = [] @@ -70,9 +86,7 @@ class LJSpeechClipCollector(object): mel, wav = example frames = mel.shape[-1] start = np.random.randint(0, frames - self.clip_frames) - mel_clip = mel[:, start: start + self.clip_frames] - wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length] + mel_clip = mel[:, start:start + self.clip_frames] + wav_clip = wav[start * self.hop_length:(start + self.clip_frames) * + self.hop_length] return mel_clip, wav_clip - - - diff --git a/examples/waveflow/preprocess.py b/examples/waveflow/preprocess.py index d4bdc8e..ac6d62e 100644 --- a/examples/waveflow/preprocess.py +++ b/examples/waveflow/preprocess.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import tqdm import csv @@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True): output_dir = Path(output_dir).expanduser() output_dir.mkdir(exist_ok=True) - transform = Transform( - config.sample_rate, - config.n_fft, - config.win_length, - config.hop_length, - config.n_mels) + transform = Transform(config.sample_rate, config.n_fft, config.win_length, + config.hop_length, config.n_mels) file_names = [] for example in tqdm.tqdm(dataset): @@ -107,23 +117,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True): np.save(str(mel_dir / base_name), mel) file_names.append((base_name, mel.shape[-1], audio.shape[-1])) - + meta_data = pd.DataFrame.from_records(file_names) - meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) - print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv"))) + meta_data.to_csv( + str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) + print("saved meta data in to {}".format( + os.path.join(output_dir, "metadata.csv"))) print("Done!") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="create dataset") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--input", type=str, help="path of the ljspeech dataset") - parser.add_argument("--output", type=str, help="path to save output dataset") - parser.add_argument("--opts", nargs=argparse.REMAINDER, + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + config = get_cfg_defaults() args = parser.parse_args() if args.config: diff --git a/examples/waveflow/synthesize.py b/examples/waveflow/synthesize.py index 1856eb2..45c751a 100644 --- a/examples/waveflow/synthesize.py +++ b/examples/waveflow/synthesize.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import numpy as np import soundfile as sf @@ -8,9 +22,9 @@ import parakeet from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow from parakeet.utils import layer_tools, checkpoint - from config import get_cfg_defaults + def main(config, args): paddle.set_device(args.device) model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path) @@ -23,7 +37,8 @@ def main(config, args): for file_path in mel_dir.iterdir(): mel = np.load(str(file_path)) audio = model.predict(mel) - audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav") + audio_path = output_dir / ( + os.path.splitext(file_path.name)[0] + ".wav") sf.write(audio_path, audio, config.data.sample_rate) print("[synthesize] {} -> {}".format(file_path, audio_path)) @@ -31,17 +46,32 @@ def main(config, args): if __name__ == "__main__": config = get_cfg_defaults() - parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.") - parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)") + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser.add_argument( + "--input", + type=str, + help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument("--device", type=str, default="cpu", help="device type to use.") - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "--device", type=str, default="cpu", help="device type to use.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) @@ -49,4 +79,4 @@ if __name__ == "__main__": print(config) print(args) - main(config, args) \ No newline at end of file + main(config, args) diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py index 1cd68f0..443cc8b 100644 --- a/examples/waveflow/train.py +++ b/examples/waveflow/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time from pathlib import Path import numpy as np @@ -34,7 +48,8 @@ class Experiment(ExperimentBase): if self.parallel > 1: model = paddle.DataParallel(model) - optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters()) + optimizer = paddle.optimizer.Adam( + config.training.lr, parameters=model.parameters()) criterion = WaveFlowLoss(sigma=config.model.sigma) self.model = model @@ -46,20 +61,22 @@ class Experiment(ExperimentBase): args = self.args ljspeech_dataset = LJSpeech(args.data) - valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) + + batch_fn = LJSpeechClipCollector(config.data.clip_frames, + config.data.hop_length) - batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length) - if not self.parallel: train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=batch_fn) else: sampler = DistributedBatchSampler( - train_set, + train_set, batch_size=config.data.batch_size, num_replicas=dist.get_world_size(), rank=dist.get_rank(), @@ -71,7 +88,7 @@ class Experiment(ExperimentBase): valid_batch_fn = LJSpeechCollector() valid_loader = DataLoader( valid_set, batch_size=1, collate_fn=valid_batch_fn) - + self.train_loader = train_loader self.valid_loader = valid_loader @@ -90,17 +107,19 @@ class Experiment(ExperimentBase): mel, wav = batch z, log_det_jocobian = self.compute_outputs(mel, wav) loss = self.criterion(z, log_det_jocobian) - loss.backward() + loss.backward() self.optimizer.step() iteration_time = time.time() - start loss_value = float(loss) msg = "Rank: {}, ".format(dist.get_rank()) msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) msg += "loss: {:>.6f}".format(loss_value) self.logger.info(msg) - self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration) + self.visualizer.add_scalar( + "train/loss", loss_value, global_step=self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() @@ -112,7 +131,8 @@ class Experiment(ExperimentBase): loss = self.criterion(z, log_det_jocobian) valid_losses.append(float(loss)) valid_loss = np.mean(valid_losses) - self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration) + self.visualizer.add_scalar( + "valid/loss", valid_loss, global_step=self.iteration) def main_sp(config, args): @@ -132,7 +152,7 @@ if __name__ == "__main__": config = get_cfg_defaults() parser = default_argument_parser() args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/examples/wavenet/config.py b/examples/wavenet/config.py index 58f9beb..658d416 100644 --- a/examples/wavenet/config.py +++ b/examples/wavenet/config.py @@ -1,19 +1,32 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from yacs.config import CfgNode as CN _C = CN() _C.data = CN( dict( - batch_size=8, # batch size - valid_size=16, # the first N examples are reserved for validation - sample_rate=22050, # Hz, sample rate - n_fft=2048, # fft frame size - win_length=1024, # window size + batch_size=8, # batch size + valid_size=16, # the first N examples are reserved for validation + sample_rate=22050, # Hz, sample rate + n_fft=2048, # fft frame size + win_length=1024, # window size hop_length=256, # hop size between ajacent frame # f_max=8000, # Hz, max frequency when converting to mel n_mels=80, # mel bands - train_clip_seconds=0.5, # audio clip length(in seconds) - ) -) + train_clip_seconds=0.5, # audio clip length(in seconds) + )) _C.model = CN( dict( @@ -21,24 +34,22 @@ _C.model = CN( n_stack=3, n_loop=10, filter_size=2, - residual_channels=128, # resiaudal channel in each flow + residual_channels=128, # resiaudal channel in each flow loss_type="mog", - output_dim=3, # single gaussian - log_scale_min=-9.0, - ) -) + output_dim=3, # single gaussian + log_scale_min=-9.0, )) _C.training = CN( dict( - lr=1e-3, # learning rates - anneal_rate=0.5, # learning rate decay rate - anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=3000000, # max iteration to train - gradient_max_norm=100.0 # global norm of gradients - ) -) + lr=1e-3, # learning rates + anneal_rate=0.5, # learning rate decay rate + anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=3000000, # max iteration to train + gradient_max_norm=100.0 # global norm of gradients + )) + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/examples/wavenet/ljspeech.py b/examples/wavenet/ljspeech.py index 18dc388..d1d3c67 100644 --- a/examples/wavenet/ljspeech.py +++ b/examples/wavenet/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from pathlib import Path import pickle @@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav from parakeet.data import dataset from parakeet.audio import AudioProcessor + class LJSpeech(Dataset): """A simple dataset adaptor for the processed ljspeech dataset.""" + def __init__(self, root): self.root = Path(root).expanduser() meta_data = pandas.read_csv( str(self.root / "metadata.csv"), sep="\t", header=None, - names=["fname", "frames", "samples"] - ) - + names=["fname", "frames", "samples"]) + records = [] - for row in meta_data.itertuples() : + for row in meta_data.itertuples(): mel_path = str(self.root / "mel" / (row.fname + ".npy")) wav_path = str(self.root / "wav" / (row.fname + ".npy")) records.append((mel_path, wav_path)) @@ -39,6 +54,7 @@ class LJSpeech(Dataset): class LJSpeechCollector(object): """A simple callable to batch LJSpeech examples.""" + def __init__(self, padding_value=0.): self.padding_value = padding_value @@ -48,15 +64,15 @@ class LJSpeechCollector(object): wavs = [example[1] for example in examples] mels = batch_spec(mels, pad_value=self.padding_value) wavs = batch_wav(wavs, pad_value=self.padding_value) - audio_starts = np.zeros((batch_size,), dtype=np.int64) + audio_starts = np.zeros((batch_size, ), dtype=np.int64) return mels, wavs, audio_starts class LJSpeechClipCollector(object): def __init__(self, clip_frames=65, hop_length=256): - self.clip_frames = clip_frames + self.clip_frames = clip_frames self.hop_length = hop_length - + def __call__(self, examples): mels = [] wavs = [] @@ -75,7 +91,8 @@ class LJSpeechClipCollector(object): mel, wav = example frames = mel.shape[-1] start = np.random.randint(0, frames - self.clip_frames) - wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length] + wav_clip = wav[start * self.hop_length:(start + self.clip_frames) * + self.hop_length] return mel, wav_clip, start @@ -132,7 +149,3 @@ class DataCollector(object): audios = np.array(audios, dtype=np.float32) audio_starts = np.array(audio_starts, dtype=np.int64) return audios, mels, audio_starts - - - - diff --git a/examples/wavenet/preprocess.py b/examples/wavenet/preprocess.py index 29b140c..cc83727 100644 --- a/examples/wavenet/preprocess.py +++ b/examples/wavenet/preprocess.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import tqdm import csv @@ -23,7 +37,7 @@ class Transform(object): self.win_length = win_length self.hop_length = hop_length self.n_mels = n_mels - + self.spec_normalizer = UnitMagnitude(min=1e-5) def __call__(self, example): @@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True): output_dir = Path(output_dir).expanduser() output_dir.mkdir(exist_ok=True) - transform = Transform( - config.sample_rate, - config.n_fft, - config.win_length, - config.hop_length, - config.n_mels) + transform = Transform(config.sample_rate, config.n_fft, config.win_length, + config.hop_length, config.n_mels) file_names = [] for example in tqdm.tqdm(dataset): @@ -108,23 +118,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True): np.save(str(mel_dir / base_name), mel) file_names.append((base_name, mel.shape[-1], audio.shape[-1])) - + meta_data = pd.DataFrame.from_records(file_names) - meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) - print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv"))) + meta_data.to_csv( + str(output_dir / "metadata.csv"), sep="\t", index=None, header=None) + print("saved meta data in to {}".format( + os.path.join(output_dir, "metadata.csv"))) print("Done!") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="create dataset") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--input", type=str, help="path of the ljspeech dataset") - parser.add_argument("--output", type=str, help="path to save output dataset") - parser.add_argument("--opts", nargs=argparse.REMAINDER, + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--input", type=str, help="path of the ljspeech dataset") + parser.add_argument( + "--output", type=str, help="path to save output dataset") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + config = get_cfg_defaults() args = parser.parse_args() if args.config: diff --git a/examples/wavenet/synthesize.py b/examples/wavenet/synthesize.py index 80b96a2..c5a69fe 100644 --- a/examples/wavenet/synthesize.py +++ b/examples/wavenet/synthesize.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import numpy as np import soundfile as sf @@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint from config import get_cfg_defaults + def main(config, args): paddle.set_device(args.device) model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path) @@ -22,7 +37,8 @@ def main(config, args): for file_path in mel_dir.iterdir(): mel = np.load(str(file_path)) audio = model.predict(mel) - audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav") + audio_path = output_dir / ( + os.path.splitext(file_path.name)[0] + ".wav") sf.write(audio_path, audio, config.data.sample_rate) print("[synthesize] {} -> {}".format(file_path, audio_path)) @@ -30,17 +46,32 @@ def main(config, args): if __name__ == "__main__": config = get_cfg_defaults() - parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.") - parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config") - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.") - parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)") + parser = argparse.ArgumentParser( + description="generate mel spectrogram with TransformerTTS.") + parser.add_argument( + "--config", + type=str, + metavar="FILE", + help="extra config to overwrite the default config") + parser.add_argument( + "--checkpoint_path", type=str, help="path of the checkpoint to load.") + parser.add_argument( + "--input", + type=str, + help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") - parser.add_argument("--device", type=str, default="cpu", help="device type to use.") - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("-v", "--verbose", action="store_true", help="print msg") - + parser.add_argument( + "--device", type=str, default="cpu", help="device type to use.") + parser.add_argument( + "--opts", + nargs=argparse.REMAINDER, + help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="print msg") + args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) @@ -48,4 +79,4 @@ if __name__ == "__main__": print(config) print(args) - main(config, args) \ No newline at end of file + main(config, args) diff --git a/examples/wavenet/train.py b/examples/wavenet/train.py index 77c54e3..8e9bc0e 100644 --- a/examples/wavenet/train.py +++ b/examples/wavenet/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time from pathlib import Path import math @@ -26,7 +40,7 @@ class Experiment(ExperimentBase): config = self.config model = ConditionalWaveNet( upsample_factors=config.model.upsample_factors, - n_stack=config.model.n_stack, + n_stack=config.model.n_stack, n_loop=config.model.n_loop, residual_channels=config.model.residual_channels, output_dim=config.model.output_dim, @@ -39,13 +53,13 @@ class Experiment(ExperimentBase): model = paddle.DataParallel(model) lr_scheduler = paddle.optimizer.lr.StepDecay( - config.training.lr, - config.training.anneal_interval, + config.training.lr, config.training.anneal_interval, config.training.anneal_rate) optimizer = paddle.optimizer.Adam( lr_scheduler, parameters=model.parameters(), - grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm)) + grad_clip=paddle.nn.ClipGradByGlobalNorm( + config.training.gradient_max_norm)) self.model = model self.model_core = model._layer if self.parallel else model @@ -56,7 +70,8 @@ class Experiment(ExperimentBase): args = self.args ljspeech_dataset = LJSpeech(args.data) - valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size) + valid_set, train_set = dataset.split(ljspeech_dataset, + config.data.valid_size) # convolutional net's causal padding size context_size = config.model.n_stack \ @@ -66,20 +81,21 @@ class Experiment(ExperimentBase): # frames used to compute loss frames_per_second = config.data.sample_rate // config.data.hop_length - train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second) - + train_clip_frames = math.ceil(config.data.train_clip_seconds * + frames_per_second) + num_frames = train_clip_frames + context_frames batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length) if not self.parallel: train_loader = DataLoader( - train_set, - batch_size=config.data.batch_size, - shuffle=True, + train_set, + batch_size=config.data.batch_size, + shuffle=True, drop_last=True, collate_fn=batch_fn) else: sampler = DistributedBatchSampler( - train_set, + train_set, batch_size=config.data.batch_size, shuffle=True, drop_last=True) @@ -89,7 +105,7 @@ class Experiment(ExperimentBase): valid_batch_fn = LJSpeechCollector() valid_loader = DataLoader( valid_set, batch_size=1, collate_fn=valid_batch_fn) - + self.train_loader = train_loader self.valid_loader = valid_loader @@ -101,20 +117,22 @@ class Experiment(ExperimentBase): self.model.train() self.optimizer.clear_grad() mel, wav, audio_starts = batch - + y = self.model(wav, mel, audio_starts) loss = self.model.loss(y, wav) - loss.backward() + loss.backward() self.optimizer.step() iteration_time = time.time() - start loss_value = float(loss) msg = "Rank: {}, ".format(dist.get_rank()) msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time) + msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, + iteration_time) msg += "loss: {:>.6f}".format(loss_value) self.logger.info(msg) - self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration) + self.visualizer.add_scalar( + "train/loss", loss_value, global_step=self.iteration) @mp_tools.rank_zero_only @paddle.no_grad() @@ -126,7 +144,8 @@ class Experiment(ExperimentBase): loss = self.model.loss(y, wav) valid_losses.append(float(loss)) valid_loss = np.mean(valid_losses) - self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration) + self.visualizer.add_scalar( + "valid/loss", valid_loss, global_step=self.iteration) def main_sp(config, args): @@ -146,7 +165,7 @@ if __name__ == "__main__": config = get_cfg_defaults() parser = default_argument_parser() args = parser.parse_args() - if args.config: + if args.config: config.merge_from_file(args.config) if args.opts: config.merge_from_list(args.opts) diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index 93d4e6b..3795111 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -18,15 +18,16 @@ import numpy as np __all__ = ["AudioProcessor"] + class AudioProcessor(object): def __init__(self, - sample_rate:int, - n_fft:int, - win_length:int, - hop_length:int, - n_mels:int=80, - f_min:int=0, - f_max:int=None, + sample_rate: int, + n_fft: int, + win_length: int, + hop_length: int, + n_mels: int=80, + f_min: int=0, + f_max: int=None, window="hann", center=True, pad_mode="reflect"): @@ -40,7 +41,7 @@ class AudioProcessor(object): self.window = window self.center = center self.pad_mode = pad_mode - + # mel self.n_mels = n_mels self.f_min = f_min @@ -48,19 +49,18 @@ class AudioProcessor(object): self.mel_filter = self._create_mel_filter() self.inv_mel_filter = np.linalg.pinv(self.mel_filter) - + def _create_mel_filter(self): - mel_filter = librosa.filters.mel( - self.sample_rate, - self.n_fft, - n_mels=self.n_mels, - fmin=self.f_min, - fmax=self.f_max) + mel_filter = librosa.filters.mel(self.sample_rate, + self.n_fft, + n_mels=self.n_mels, + fmin=self.f_min, + fmax=self.f_max) return mel_filter def read_wav(self, filename): # resampling may occur - wav, _ = librosa.load(filename, sr=self.sample_rate) + wav, _ = librosa.load(filename, sr=self.sample_rate) return wav def write_wav(self, path, wav): @@ -69,7 +69,7 @@ class AudioProcessor(object): def stft(self, wav): D = librosa.core.stft( wav, - n_fft = self.n_fft, + n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, diff --git a/parakeet/audio/spec_normalizer.py b/parakeet/audio/spec_normalizer.py index 08cea1b..069c453 100644 --- a/parakeet/audio/spec_normalizer.py +++ b/parakeet/audio/spec_normalizer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This modules contains normalizers for spectrogram magnitude. @@ -19,22 +32,24 @@ __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"] class NormalizerBase(object): def transform(self, spec): raise NotImplementedError("transform must be implemented") - + def inverse(self, normalized): raise NotImplementedError("inverse must be implemented") + class LogMagnitude(NormalizerBase): """ This is a simple normalizer used in Waveglow, Waveflow, tacotron2... """ + def __init__(self, min=1e-7): self.min = min - + def transform(self, x): x = np.maximum(x, self.min) x = np.log(x) return x - + def inverse(self, x): return np.exp(x) @@ -44,15 +59,16 @@ class UnitMagnitude(NormalizerBase): """ This is the normalizer used in the """ + def __init__(self, min=1e-5): self.min = min - + def transform(self, x): db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20 normalized = (db_scale + 100) / 100 clipped = np.clip(normalized, 0, 1) return clipped - + def inverse(self, x): denormalized = np.clip(x, 0, 1) * 100 - 100 out = np.exp((denormalized + 20) / 20 * np.log(10)) diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 1551124..4c5be61 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided. import numpy as np __all__ = [ - "batch_text_id", "batch_wav", "batch_spec", - "TextIDBatcher", "WavBatcher", "SpecBatcher", + "batch_text_id", + "batch_wav", + "batch_spec", + "TextIDBatcher", + "WavBatcher", + "SpecBatcher", ] + class TextIDBatcher(object): """A wrapper class for `batch_text_id`.""" @@ -99,8 +104,8 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): pad_len = max_len - example.shape[-1] batch.append( np.pad(example, [(0, pad_len)], - mode='constant', - constant_values=pad_value)) + mode='constant', + constant_values=pad_value)) return np.array(batch, dtype=dtype) @@ -113,7 +118,11 @@ class SpecBatcher(object): self.time_major = time_major def __call__(self, minibatch): - out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype) + out = batch_spec( + minibatch, + pad_value=self.pad_value, + time_major=self.time_major, + dtype=self.dtype) return out @@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32): """ # assume (F, T) or (T, F) peek_example = minibatch[0] - assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram" + assert len( + peek_example.shape) == 2, "we only handles mono channel spectrogram" # assume (F, n_frame) or (n_frame, F) time_idx = 0 if time_major else -1 @@ -143,11 +153,11 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32): if time_major: batch.append( np.pad(example, [(0, pad_len), (0, 0)], - mode='constant', - constant_values=pad_value)) + mode='constant', + constant_values=pad_value)) else: batch.append( np.pad(example, [(0, 0), (0, pad_len)], - mode='constant', - constant_values=pad_value)) + mode='constant', + constant_values=pad_value)) return np.array(batch, dtype=dtype) diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py index de9b40c..a188767 100644 --- a/parakeet/data/dataset.py +++ b/parakeet/data/dataset.py @@ -17,17 +17,25 @@ import paddle from paddle.io import Dataset __all__ = [ - "split", "TransformDataset", "CacheDataset", "TupleDataset", - "DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset", + "split", + "TransformDataset", + "CacheDataset", + "TupleDataset", + "DictDataset", + "SliceDataset", + "SubsetDataset", + "FilterDataset", "ChainDataset", ] + def split(dataset, first_size): """A utility function to split a dataset into two datasets.""" first = SliceDataset(dataset, 0, first_size) second = SliceDataset(dataset, first_size, len(dataset)) return first, second + class TransformDataset(Dataset): def __init__(self, dataset, transform): """Dataset which is transformed from another with a transform. @@ -141,7 +149,7 @@ class DictDataset(Dataset): for i in six.moves.range(length)] else: return batches - + def __len__(self): return self._length diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py index de7be70..e75da0b 100644 --- a/parakeet/datasets/__init__.py +++ b/parakeet/datasets/__init__.py @@ -1,2 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.datasets.common import * from parakeet.datasets.ljspeech import * \ No newline at end of file diff --git a/parakeet/datasets/common.py b/parakeet/datasets/common.py index e0d91a3..a1d16d6 100644 --- a/parakeet/datasets/common.py +++ b/parakeet/datasets/common.py @@ -1,9 +1,24 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from paddle.io import Dataset import os import librosa __all__ = ["AudioFolderDataset"] + class AudioFolderDataset(Dataset): def __init__(self, path, sample_rate, extension="wav"): self.root = os.path.expanduser(path) @@ -19,5 +34,5 @@ class AudioFolderDataset(Dataset): def __getitem__(self, i): file_name = self.file_names[i] - y, _ = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable + y, _ = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable return y diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py index 9c2e0c3..a37863f 100644 --- a/parakeet/datasets/ljspeech.py +++ b/parakeet/datasets/ljspeech.py @@ -1,8 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from paddle.io import Dataset from pathlib import Path __all__ = ["LJSpeechMetaData"] + class LJSpeechMetaData(Dataset): def __init__(self, root): self.root = Path(root).expanduser() @@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset): def __len__(self): return len(self.records) - diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py index cee73c1..2d06dda 100644 --- a/parakeet/frontend/__init__.py +++ b/parakeet/frontend/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.frontend.vocab import * from parakeet.frontend.phonectic import * from parakeet.frontend.punctuation import * diff --git a/parakeet/frontend/normalizer/__init__.py b/parakeet/frontend/normalizer/__init__.py index f098650..37fd580 100644 --- a/parakeet/frontend/normalizer/__init__.py +++ b/parakeet/frontend/normalizer/__init__.py @@ -1,2 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.frontend.normalizer.normalizer import * from parakeet.frontend.normalizer.numbers import * diff --git a/parakeet/frontend/normalizer/abbrrviation.py b/parakeet/frontend/normalizer/abbrrviation.py index e69de29..9118340 100644 --- a/parakeet/frontend/normalizer/abbrrviation.py +++ b/parakeet/frontend/normalizer/abbrrviation.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/parakeet/frontend/normalizer/acronyms.py b/parakeet/frontend/normalizer/acronyms.py index e69de29..9118340 100644 --- a/parakeet/frontend/normalizer/acronyms.py +++ b/parakeet/frontend/normalizer/acronyms.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/parakeet/frontend/normalizer/width.py b/parakeet/frontend/normalizer/width.py index 440557f..b1598af 100644 --- a/parakeet/frontend/normalizer/width.py +++ b/parakeet/frontend/normalizer/width.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + def full2half_width(ustr): half = [] for u in ustr: num = ord(u) - if num == 0x3000: # 全角空格变半角 + if num == 0x3000: # 全角空格变半角 num = 32 elif 0xFF01 <= num <= 0xFF5E: num -= 0xfee0 @@ -10,15 +24,16 @@ def full2half_width(ustr): half.append(u) return ''.join(half) + def half2full_width(ustr): full = [] for u in ustr: num = ord(u) - if num == 32: # 半角空格变全角 + if num == 32: # 半角空格变全角 num = 0x3000 elif 0x21 <= num <= 0x7E: num += 0xfee0 - u = chr(num) # to unicode + u = chr(num) # to unicode full.append(u) - - return ''.join(full) \ No newline at end of file + + return ''.join(full) diff --git a/parakeet/frontend/punctuation.py b/parakeet/frontend/punctuation.py index 9984970..099e759 100644 --- a/parakeet/frontend/punctuation.py +++ b/parakeet/frontend/punctuation.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import abc import string @@ -13,15 +27,8 @@ EN_PUNCT = [ "!", ] -CN_PUNCT = [ - "、", - ",", - ";", - ":", - "。", - "?", - "!" -] +CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"] + def get_punctuations(lang): if lang == "en": @@ -30,4 +37,3 @@ def get_punctuations(lang): return CN_PUNCT else: raise ValueError(f"language {lang} Not supported") - diff --git a/parakeet/models/transformer_tts.py b/parakeet/models/transformer_tts.py index f84a9f8..c7f0ccd 100644 --- a/parakeet/models/transformer_tts.py +++ b/parakeet/models/transformer_tts.py @@ -559,7 +559,7 @@ class TransformerTTS(nn.Layer): @classmethod def from_pretrained(cls, frontend, config, checkpoint_path): model = TransformerTTS( - frontend, + frontend, d_encoder=config.model.d_encoder, d_decoder=config.model.d_decoder, d_mel=config.data.d_mel, @@ -575,11 +575,12 @@ class TransformerTTS(nn.Layer): decoder_prenet_dropout=config.model.decoder_prenet_dropout, dropout=config.model.dropout) - iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) + iteration = checkpoint.load_parameters( + model, checkpoint_path=checkpoint_path) drop_n_heads = scheduler.StepWise(config.training.drop_n_heads) reduction_factor = scheduler.StepWise(config.training.reduction_factor) model.set_constants( - reduction_factor=reduction_factor(iteration), + reduction_factor=reduction_factor(iteration), drop_n_heads=drop_n_heads(iteration)) return model diff --git a/parakeet/models/waveflow.py b/parakeet/models/waveflow.py index d58127b..625e61f 100644 --- a/parakeet/models/waveflow.py +++ b/parakeet/models/waveflow.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import numpy as np from typing import List, Union, Tuple @@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] + def fold(x, n_group): r"""Fold audio or spectrogram's temporal dimension in to groups. @@ -31,6 +46,7 @@ def fold(x, n_group): new_shape = spatial_shape + [time_steps // n_group, n_group] return paddle.reshape(x, new_shape) + class UpsampleNet(nn.LayerList): """Layer to upsample mel spectrogram to the same temporal resolution with the corresponding waveform. @@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList): --------- ``librosa.core.stft`` """ + def __init__(self, upsample_factors): super(UpsampleNet, self).__init__() for factor in upsample_factors: @@ -67,16 +84,18 @@ class UpsampleNet(nn.LayerList): init = I.Uniform(-std, std) self.append( nn.utils.weight_norm( - nn.Conv2DTranspose(1, 1, (3, 2 * factor), + nn.Conv2DTranspose( + 1, + 1, (3, 2 * factor), padding=(1, factor // 2), stride=(1, factor), weight_attr=init, bias_attr=init))) - + # upsample factors self.upsample_factor = np.prod(upsample_factors) self.upsample_factors = upsample_factors - + def forward(self, x, trim_conv_artifact=False): r"""Forward pass of the ``UpsampleNet``. @@ -131,38 +150,47 @@ class ResidualBlock(nn.Layer): dilations : int Dilations of the Convolution2d applied to the input. """ + def __init__(self, channels, cond_channels, kernel_size, dilations): super(ResidualBlock, self).__init__() # input conv std = math.sqrt(1 / channels * np.prod(kernel_size)) init = I.Uniform(-std, std) - receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)] + receptive_field = [ + 1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations) + ] rh, rw = receptive_field - paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same - conv = nn.Conv2D(channels, 2 * channels, kernel_size, - padding=paddings, - dilation=dilations, - weight_attr=init, - bias_attr=init) + paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same + conv = nn.Conv2D( + channels, + 2 * channels, + kernel_size, + padding=paddings, + dilation=dilations, + weight_attr=init, + bias_attr=init) self.conv = nn.utils.weight_norm(conv) self.rh = rh self.rw = rw self.dilations = dilations - + # condition projection std = math.sqrt(1 / cond_channels) init = I.Uniform(-std, std) - condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1), - weight_attr=init, bias_attr=init) + condition_proj = nn.Conv2D( + cond_channels, + 2 * channels, (1, 1), + weight_attr=init, + bias_attr=init) self.condition_proj = nn.utils.weight_norm(condition_proj) - + # parametric residual & skip connection std = math.sqrt(1 / channels) init = I.Uniform(-std, std) - out_proj = nn.Conv2D(channels, 2 * channels, (1, 1), - weight_attr=init, bias_attr=init) + out_proj = nn.Conv2D( + channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init) self.out_proj = nn.utils.weight_norm(out_proj) - + def forward(self, x, condition): """Compute output for a whole folded sequence. @@ -185,10 +213,10 @@ class ResidualBlock(nn.Layer): x_in = x x = self.conv(x) x += self.condition_proj(condition) - + content, gate = paddle.chunk(x, 2, axis=1) x = paddle.tanh(content) * F.sigmoid(gate) - + x = self.out_proj(x) res, skip = paddle.chunk(x, 2, axis=1) res = x_in + res @@ -249,7 +277,7 @@ class ResidualBlock(nn.Layer): content, gate = paddle.chunk(x_row, 2, axis=1) x_row = paddle.tanh(content) * F.sigmoid(gate) - + x_row = self.out_proj(x_row) res, skip = paddle.chunk(x_row, 2, axis=1) res = x_row_in + res @@ -290,20 +318,23 @@ class ResidualNet(nn.LayerList): ValueError If the length of dilations_h does not equals n_layers. """ - def __init__(self, - n_layer: int, - residual_channels: int, - condition_channels: int, - kernel_size: Tuple[int], + + def __init__(self, + n_layer: int, + residual_channels: int, + condition_channels: int, + kernel_size: Tuple[int], dilations_h: List[int]): if len(dilations_h) != n_layer: - raise ValueError("number of dilations_h should equals num of layers") + raise ValueError( + "number of dilations_h should equals num of layers") super(ResidualNet, self).__init__() for i in range(n_layer): - dilation = (dilations_h[i], 2 ** i) - layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation) + dilation = (dilations_h[i], 2**i) + layer = ResidualBlock(residual_channels, condition_channels, + kernel_size, dilation) self.append(layer) - + def forward(self, x, condition): """Comput the output of given the input and the condition. @@ -332,7 +363,7 @@ class ResidualNet(nn.LayerList): """ for layer in self: layer.start_sequence() - + def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffers. @@ -386,33 +417,37 @@ class Flow(nn.Layer): Number of timesteps to the folded into a group. """ dilations_dict = { - 8: [1, 1, 1, 1, 1, 1, 1, 1], - 16: [1, 1, 1, 1, 1, 1, 1, 1], - 32: [1, 2, 4, 1, 2, 4, 1, 2], - 64: [1, 2, 4, 8, 16, 1, 2, 4], - 128: [1, 2, 4, 8, 16, 32, 64, 1] + 8: [1, 1, 1, 1, 1, 1, 1, 1], + 16: [1, 1, 1, 1, 1, 1, 1, 1], + 32: [1, 2, 4, 1, 2, 4, 1, 2], + 64: [1, 2, 4, 8, 16, 1, 2, 4], + 128: [1, 2, 4, 8, 16, 32, 64, 1] } - + def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group): super(Flow, self).__init__() # input projection self.input_proj = nn.utils.weight_norm( - nn.Conv2D(1, channels, (1, 1), - weight_attr=I.Uniform(-1., 1.), - bias_attr=I.Uniform(-1., 1.))) - + nn.Conv2D( + 1, + channels, (1, 1), + weight_attr=I.Uniform(-1., 1.), + bias_attr=I.Uniform(-1., 1.))) + # residual net - self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, + self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size, self.dilations_dict[n_group]) - + # output projection - self.output_proj = nn.Conv2D(channels, 2, (1, 1), - weight_attr=I.Constant(0.), - bias_attr=I.Constant(0.)) - + self.output_proj = nn.Conv2D( + channels, + 2, (1, 1), + weight_attr=I.Constant(0.), + bias_attr=I.Constant(0.)) + # specs self.n_group = n_group - + def _predict_parameters(self, x, condition): x = self.input_proj(x) x = self.resnet(x, condition) @@ -421,11 +456,11 @@ class Flow(nn.Layer): return logs, b def _transform(self, x, logs, b): - z_0 = x[:, :, :1, :] # the first row, just copy it - z_out = x[:, :, 1:, :] * paddle.exp(logs) + b + z_0 = x[:, :, :1, :] # the first row, just copy it + z_out = x[:, :, 1:, :] * paddle.exp(logs) + b z_out = paddle.concat([z_0, z_out], axis=2) return z_out - + def forward(self, x, condition): """Probability density estimation. It is done by inversely transform a sample from p(X) into a sample from p(Z). @@ -452,8 +487,8 @@ class Flow(nn.Layer): transformation from x to z. """ # (B, C, H-1, W) - logs, b = self._predict_parameters( - x[:, :, :-1, :], condition[:, :, 1:, :]) + logs, b = self._predict_parameters(x[:, :, :-1, :], + condition[:, :, 1:, :]) z = self._transform(x, logs, b) return z, (logs, b) @@ -467,7 +502,7 @@ class Flow(nn.Layer): def _inverse_transform_row(self, z_row, logs, b): x_row = (z_row - b) * paddle.exp(-logs) return x_row - + def _inverse_row(self, z_row, x_row, condition_row): logs, b = self._predict_row_parameters(x_row, condition_row) x_next_row = self._inverse_transform_row(z_row, logs, b) @@ -475,7 +510,7 @@ class Flow(nn.Layer): def _start_sequence(self): self.resnet.start_sequence() - + def inverse(self, z, condition): """Sampling from the the distrition p(X). It is done by sample form p(Z) and transform the sample. It is a auto regressive transformation. @@ -510,15 +545,16 @@ class Flow(nn.Layer): self._start_sequence() for i in range(1, self.n_group): - x_row = x[-1] # actuallt i-1:i - z_row = z[:, :, i:i+1, :] - condition_row = condition[:, :, i:i+1, :] + x_row = x[-1] # actuallt i-1:i + z_row = z[:, :, i:i + 1, :] + condition_row = condition[:, :, i:i + 1, :] - x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row) + x_next_row, (logs, b) = self._inverse_row(z_row, x_row, + condition_row) x.append(x_next_row) logs_list.append(logs) b_list.append(b) - + x = paddle.concat(x, 2) logs = paddle.concat(logs_list, 2) b = paddle.concat(b_list, 2) @@ -549,21 +585,25 @@ class WaveFlow(nn.LayerList): kernel_size : Union[int, List[int]] Kernel size of the convolution layer in each ResidualBlock. """ - def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size): + + def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, + kernel_size): if n_group % 2 or n_flows % 2: - raise ValueError("number of flows and number of group must be even " - "since a permutation along group among flows is used.") + raise ValueError( + "number of flows and number of group must be even " + "since a permutation along group among flows is used.") super(WaveFlow, self).__init__() for _ in range(n_flows): - self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group)) - + self.append( + Flow(n_layers, channels, mel_bands, kernel_size, n_group)) + # permutations in h self.perms = self._create_perm(n_group, n_flows) # specs self.n_group = n_group self.n_flows = n_flows - + def _create_perm(self, n_group, n_flows): indices = list(range(n_group)) half = n_group // 2 @@ -572,20 +612,21 @@ class WaveFlow(nn.LayerList): if i < n_flows // 2: perms.append(indices[::-1]) else: - perm = list(reversed(indices[:half])) + list(reversed(indices[half:])) + perm = list(reversed(indices[:half])) + list( + reversed(indices[half:])) perms.append(perm) return perms - + def _trim(self, x, condition): assert condition.shape[-1] >= x.shape[-1] pruned_len = int(x.shape[-1] // self.n_group * self.n_group) - + if x.shape[-1] > pruned_len: x = x[:, :pruned_len] if condition.shape[-1] > pruned_len: condition = condition[:, :, :pruned_len] return x, condition - + def forward(self, x, condition): """Probability density estimation of random variable x given the condition. @@ -610,21 +651,23 @@ class WaveFlow(nn.LayerList): # x: (B, T) # condition: (B, C, T) upsampled condition x, condition = self._trim(x, condition) - + # to (B, C, h, T//h) layout - x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1) - condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2]) - + x = paddle.unsqueeze( + paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1) + condition = paddle.transpose( + fold(condition, self.n_group), [0, 1, 3, 2]) + # flows logs_list = [] for i, layer in enumerate(self): - x, (logs, b) = layer(x, condition) + x, (logs, b) = layer(x, condition) logs_list.append(logs) # permute paddle has no shuffle dim x = geo.shuffle_dim(x, 2, perm=self.perms[i]) condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) - z = paddle.squeeze(x, 1) # (B, H, W) + z = paddle.squeeze(x, 1) # (B, H, W) batch_size = z.shape[0] z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1]) @@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList): z, condition = self._trim(z, condition) # to (B, C, h, T//h) layout - z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1) - condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2]) + z = paddle.unsqueeze( + paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1) + condition = paddle.transpose( + fold(condition, self.n_group), [0, 1, 3, 2]) # reverse it flow by flow for i in reversed(range(self.n_flows)): @@ -663,7 +708,7 @@ class WaveFlow(nn.LayerList): condition = geo.shuffle_dim(condition, 2, perm=self.perms[i]) z, (logs, b) = self[i].inverse(z, condition) - x = paddle.squeeze(z, 1) # (B, H, W) + x = paddle.squeeze(z, 1) # (B, H, W) batch_size = x.shape[0] x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1]) return x @@ -695,23 +740,24 @@ class ConditionalWaveFlow(nn.LayerList): kernel_size : Union[int, List[int]] Kernel size of the convolution layer in each ResidualBlock. """ - def __init__(self, - upsample_factors: List[int], - n_flows: int, - n_layers: int, - n_group: int, - channels: int, - n_mels: int, - kernel_size: Union[int, List[int]]): + + def __init__(self, + upsample_factors: List[int], + n_flows: int, + n_layers: int, + n_group: int, + channels: int, + n_mels: int, + kernel_size: Union[int, List[int]]): super(ConditionalWaveFlow, self).__init__() self.encoder = UpsampleNet(upsample_factors) self.decoder = WaveFlow( - n_flows=n_flows, - n_layers=n_layers, - n_group=n_group, - channels=channels, - mel_bands=n_mels, - kernel_size=kernel_size) + n_flows=n_flows, + n_layers=n_layers, + n_group=n_group, + channels=channels, + mel_bands=n_mels, + kernel_size=kernel_size) def forward(self, audio, mel): """Compute the transformed random variable z (x to z) and the log of @@ -737,7 +783,7 @@ class ConditionalWaveFlow(nn.LayerList): condition = self.encoder(mel) z, log_det_jacobian = self.decoder(audio, condition) return z, log_det_jacobian - + @paddle.no_grad() def infer(self, mel): r"""Generate raw audio given mel spectrogram. @@ -752,12 +798,12 @@ class ConditionalWaveFlow(nn.LayerList): Tensor : [shape=(B, T)] The synthesized audio, where``T <= T_mel \* upsample_factors``. """ - condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) + condition = self.encoder(mel, trim_conv_artifact=True) #(B, C, T) batch_size, _, time_steps = condition.shape z = paddle.randn([batch_size, time_steps], dtype=mel.dtype) x = self.decoder.inverse(z, condition) return x - + @paddle.no_grad() def predict(self, mel): """Generate raw audio given mel spectrogram. @@ -777,7 +823,7 @@ class ConditionalWaveFlow(nn.LayerList): audio = self.infer(mel) audio = audio[0].numpy() return audio - + @classmethod def from_pretrained(cls, config, checkpoint_path): """Build a ConditionalWaveFlow model from a pretrained model. @@ -795,14 +841,13 @@ class ConditionalWaveFlow(nn.LayerList): ConditionalWaveFlow The model built from pretrained result. """ - model = cls( - upsample_factors=config.model.upsample_factors, - n_flows=config.model.n_flows, - n_layers=config.model.n_layers, - n_group=config.model.n_group, - channels=config.model.channels, - n_mels=config.data.n_mels, - kernel_size=config.model.kernel_size) + model = cls(upsample_factors=config.model.upsample_factors, + n_flows=config.model.n_flows, + n_layers=config.model.n_layers, + n_group=config.model.n_group, + channels=config.model.channels, + n_mels=config.data.n_mels, + kernel_size=config.model.kernel_size) checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) return model @@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer): The standard deviation of the gaussian noise used in WaveFlow, by default 1.0. """ + def __init__(self, sigma=1.0): super(WaveFlowLoss, self).__init__() self.sigma = sigma @@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer): Tensor [shape=(1,)] The loss. """ - loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian + loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma + ) - log_det_jacobian loss = loss / np.prod(z.shape) return loss + self.const diff --git a/parakeet/models/wavenet.py b/parakeet/models/wavenet.py index 8e6f272..5ff3435 100644 --- a/parakeet/models/wavenet.py +++ b/parakeet/models/wavenet.py @@ -18,7 +18,7 @@ from typing import Union, Sequence, List from tqdm import trange import numpy as np -import paddle +import paddle from paddle import nn from paddle.nn import functional as F import paddle.fluid.initializer as I @@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools __all__ = ["WaveNet", "ConditionalWaveNet"] + def crop(x, audio_start, audio_length): """Crop the upsampled condition to match audio_length. @@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList): --------- ``librosa.core.stft`` """ + def __init__(self, upscale_factors=[16, 16]): super(UpsampleNet, self).__init__() self.upscale_factors = list(upscale_factors) @@ -106,9 +108,11 @@ class UpsampleNet(nn.LayerList): for factor in self.upscale_factors: self.append( nn.utils.weight_norm( - nn.Conv2DTranspose(1, 1, - kernel_size=(3, 2 * factor), - stride=(1, factor), + nn.Conv2DTranspose( + 1, + 1, + kernel_size=(3, 2 * factor), + stride=(1, factor), padding=(1, factor // 2)))) def forward(self, x): @@ -159,29 +163,34 @@ class ResidualBlock(nn.Layer): dilation :int Dilation of the internal convolution cells. """ - def __init__(self, - residual_channels: int, - condition_dim: int, + + def __init__(self, + residual_channels: int, + condition_dim: int, filter_size: Union[int, Sequence[int]], dilation: int): - + super(ResidualBlock, self).__init__() dilated_channels = 2 * residual_channels # following clarinet's implementation, we do not have parametric residual # & skip connection. - _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size + _filter_size = filter_size[0] if isinstance(filter_size, ( + list, tuple)) else filter_size std = math.sqrt(1 / (_filter_size * residual_channels)) - conv = Conv1dCell(residual_channels, - dilated_channels, - filter_size, - dilation=dilation, - weight_attr=I.Normal(scale=std)) + conv = Conv1dCell( + residual_channels, + dilated_channels, + filter_size, + dilation=dilation, + weight_attr=I.Normal(scale=std)) self.conv = nn.utils.weight_norm(conv) std = math.sqrt(1 / condition_dim) - condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,), - weight_attr=I.Normal(scale=std)) + condition_proj = Conv1dCell( + condition_dim, + dilated_channels, (1, ), + weight_attr=I.Normal(scale=std)) self.condition_proj = nn.utils.weight_norm(condition_proj) self.filter_size = filter_size @@ -309,10 +318,11 @@ class ResidualNet(nn.LayerList): Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``. """ - def __init__(self, - n_stack: int, - n_loop: int, - residual_channels: int, + + def __init__(self, + n_stack: int, + n_loop: int, + residual_channels: int, condition_dim: int, filter_size: int): super(ResidualNet, self).__init__() @@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList): dilations = [2**i for i in range(n_loop)] * n_stack self.context_size = 1 + sum(dilations) for dilation in dilations: - self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation)) + self.append( + ResidualBlock(residual_channels, condition_dim, filter_size, + dilation)) def forward(self, x, condition=None): """Forward pass of ``ResidualNet``. @@ -345,7 +357,7 @@ class ResidualNet(nn.LayerList): skip_connections = skip else: skip_connections = paddle.scale(skip_connections + skip, - math.sqrt(0.5)) + math.sqrt(0.5)) return skip_connections def start_sequence(self): @@ -381,7 +393,7 @@ class ResidualNet(nn.LayerList): skip_connections = skip else: skip_connections = paddle.scale(skip_connections + skip, - math.sqrt(0.5)) + math.sqrt(0.5)) return skip_connections @@ -426,6 +438,7 @@ class WaveNet(nn.Layer): This is only used for computing loss when ``loss_type`` is "mog", If the predicted log scale is less than -9.0, it is clipped at -9.0. """ + def __init__(self, n_stack, n_loop, residual_channels, output_dim, condition_dim, filter_size, loss_type, log_scale_min): @@ -437,19 +450,24 @@ class WaveNet(nn.Layer): else: if (output_dim % 3 != 0): raise ValueError( - "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim)) - self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1) + "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}". + format(output_dim)) + self.embed = nn.utils.weight_norm( + nn.Linear(1, residual_channels), dim=1) self.resnet = ResidualNet(n_stack, n_loop, residual_channels, condition_dim, filter_size) self.context_size = self.resnet.context_size skip_channels = residual_channels # assume the same channel - self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1) - self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1) + self.proj1 = nn.utils.weight_norm( + nn.Linear(skip_channels, skip_channels), dim=1) + self.proj2 = nn.utils.weight_norm( + nn.Linear(skip_channels, skip_channels), dim=1) # if loss_type is softmax, output_dim is n_vocab of waveform magnitude. # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev) - self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1) + self.proj3 = nn.utils.weight_norm( + nn.Linear(skip_channels, output_dim), dim=1) self.loss_type = loss_type self.output_dim = output_dim @@ -781,26 +799,28 @@ class ConditionalWaveNet(nn.Layer): This is only used for computing loss when ``loss_type`` is "mog", If the predicted log scale is less than -9.0, it is clipped at -9.0. """ - def __init__(self, - upsample_factors: List[int], - n_stack: int, - n_loop: int, - residual_channels: int, + + def __init__(self, + upsample_factors: List[int], + n_stack: int, + n_loop: int, + residual_channels: int, output_dim: int, - n_mels: int, - filter_size: int=2, - loss_type: str="mog", + n_mels: int, + filter_size: int=2, + loss_type: str="mog", log_scale_min: float=-9.0): super(ConditionalWaveNet, self).__init__() self.encoder = UpsampleNet(upsample_factors) - self.decoder = WaveNet(n_stack=n_stack, - n_loop=n_loop, - residual_channels=residual_channels, - output_dim=output_dim, - condition_dim=n_mels, - filter_size=filter_size, - loss_type=loss_type, - log_scale_min=log_scale_min) + self.decoder = WaveNet( + n_stack=n_stack, + n_loop=n_loop, + residual_channels=residual_channels, + output_dim=output_dim, + condition_dim=n_mels, + filter_size=filter_size, + loss_type=loss_type, + log_scale_min=log_scale_min) def forward(self, audio, mel, audio_start): """Compute the output distribution given the mel spectrogram and the input(for teacher force training). @@ -895,11 +915,11 @@ class ConditionalWaveNet(nn.Layer): self.decoder.start_sequence() x_t = paddle.zeros((batch_size, ), dtype=mel.dtype) for i in trange(time_steps): - c_t = condition[:, :, i] # (B, C) - y_t = self.decoder.add_input(x_t, c_t) #(B, C) + c_t = condition[:, :, i] # (B, C) + y_t = self.decoder.add_input(x_t, c_t) #(B, C) y_t = paddle.unsqueeze(y_t, 1) - x_t = self.sample(y_t) # (B, 1) - x_t = paddle.squeeze(x_t, 1) #(B,) + x_t = self.sample(y_t) # (B, 1) + x_t = paddle.squeeze(x_t, 1) #(B,) samples.append(x_t) samples = paddle.stack(samples, -1) return samples @@ -943,16 +963,15 @@ class ConditionalWaveNet(nn.Layer): ConditionalWaveNet The model built from pretrained result. """ - model = cls( - upsample_factors=config.model.upsample_factors, - n_stack=config.model.n_stack, - n_loop=config.model.n_loop, - residual_channels=config.model.residual_channels, - output_dim=config.model.output_dim, - n_mels=config.data.n_mels, - filter_size=config.model.filter_size, - loss_type=config.model.loss_type, - log_scale_min=config.model.log_scale_min) + model = cls(upsample_factors=config.model.upsample_factors, + n_stack=config.model.n_stack, + n_loop=config.model.n_loop, + residual_channels=config.model.residual_channels, + output_dim=config.model.output_dim, + n_mels=config.data.n_mels, + filter_size=config.model.filter_size, + loss_type=config.model.loss_type, + log_scale_min=config.model.log_scale_min) layer_tools.summary(model) checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) return model diff --git a/parakeet/modules/audio.py b/parakeet/modules/audio.py index ebcc6c6..03e42b0 100644 --- a/parakeet/modules/audio.py +++ b/parakeet/modules/audio.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle from paddle import nn from paddle.nn import functional as F from scipy import signal -import numpy as np +import numpy as np __all__ = ["quantize", "dequantize", "STFT"] @@ -86,6 +100,7 @@ class STFT(nn.Layer): Ony ``center`` and ``reflect`` padding is supported now. """ + def __init__(self, n_fft, hop_length, win_length, window="hanning"): super(STFT, self).__init__() self.hop_length = hop_length @@ -109,7 +124,8 @@ class STFT(nn.Layer): (self.n_bin, 1, 1, self.n_fft)) w = np.concatenate([w_real, w_imag], axis=0) - self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) + self.weight = paddle.cast( + paddle.to_tensor(w), paddle.get_default_dtype()) def forward(self, x): """Compute the stft transform. diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py index b57abf2..d984605 100644 --- a/parakeet/modules/conv.py +++ b/parakeet/modules/conv.py @@ -20,6 +20,7 @@ __all__ = [ "Conv1dBatchNorm", ] + class Conv1dCell(nn.Conv1D): """A subclass of Conv1D layer, which can be used in an autoregressive decoder like an RNN cell. @@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer): epsilon : [type], optional The epsilon of the BatchNorm1D layer, by default 1e-05 """ + def __init__(self, in_channels, out_channels, diff --git a/parakeet/modules/geometry.py b/parakeet/modules/geometry.py index ec96daf..05a5931 100644 --- a/parakeet/modules/geometry.py +++ b/parakeet/modules/geometry.py @@ -1,6 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import paddle + def shuffle_dim(x, axis, perm=None): """Permute input tensor along aixs given the permutation or randomly. @@ -32,7 +47,7 @@ def shuffle_dim(x, axis, perm=None): perm = np.array(perm) else: perm = np.random.permutation(size) - + perm = paddle.to_tensor(perm) out = paddle.gather(x, perm, axis) return out diff --git a/parakeet/modules/losses.py b/parakeet/modules/losses.py index 3e22480..ab188fd 100644 --- a/parakeet/modules/losses.py +++ b/parakeet/modules/losses.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numba import numpy as np import paddle @@ -5,12 +19,13 @@ from paddle import nn from paddle.nn import functional as F __all__ = [ - "weighted_mean", - "masked_l1_loss", - "masked_softmax_with_cross_entropy", + "weighted_mean", + "masked_l1_loss", + "masked_softmax_with_cross_entropy", "diagonal_loss", ] + def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. @@ -88,12 +103,11 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): return loss -def diagonal_loss( - attentions, - input_lengths, - target_lengths, - g=0.2, - multihead=False): +def diagonal_loss(attentions, + input_lengths, + target_lengths, + g=0.2, + multihead=False): """A metric to evaluate how diagonal a attention distribution is. It is computed for batch attention distributions. For each attention @@ -133,6 +147,7 @@ def diagonal_loss( else: return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1)) + @numba.jit(nopython=True) def guided_attention(N, max_N, T, max_T, g): W = np.zeros((max_T, max_N), dtype=np.float32) @@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g): # (T_dec, T_enc) return W + def guided_attentions(input_lengths, target_lengths, g=0.2): B = len(input_lengths) max_input_len = input_lengths.max() @@ -151,4 +167,4 @@ def guided_attentions(input_lengths, target_lengths, g=0.2): W[b] = guided_attention(input_lengths[b], max_input_len, target_lengths[b], max_target_len, g) # (B, T_dec, T_enc) - return W \ No newline at end of file + return W diff --git a/parakeet/modules/masking.py b/parakeet/modules/masking.py index c54a5b1..96871a9 100644 --- a/parakeet/modules/masking.py +++ b/parakeet/modules/masking.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle from paddle.fluid.layers import sequence_mask @@ -8,6 +22,7 @@ __all__ = [ "future_mask", ] + def id_mask(input, padding_index=0, dtype="bool"): """Generate mask with input ids. diff --git a/parakeet/modules/positional_encoding.py b/parakeet/modules/positional_encoding.py index 084ccf3..07a86c9 100644 --- a/parakeet/modules/positional_encoding.py +++ b/parakeet/modules/positional_encoding.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import numpy as np import paddle @@ -5,6 +19,7 @@ from paddle.nn import functional as F __all__ = ["positional_encoding"] + def positional_encoding(start_index, length, size, dtype=None): r"""Generate standard positional encoding matrix. @@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None): dtype = dtype or paddle.get_default_dtype() channel = np.arange(0, size, 2) index = np.arange(start_index, start_index + length, 1) - p = np.expand_dims(index, -1) / (10000 ** (channel / float(size))) + p = np.expand_dims(index, -1) / (10000**(channel / float(size))) encodings = np.zeros([length, size]) encodings[:, 0::2] = np.sin(p) encodings[:, 1::2] = np.cos(p) diff --git a/parakeet/modules/transformer.py b/parakeet/modules/transformer.py index 18a7523..e857990 100644 --- a/parakeet/modules/transformer.py +++ b/parakeet/modules/transformer.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import paddle from paddle import nn @@ -12,6 +26,7 @@ __all__ = [ "TransformerDecoderLayer", ] + class PositionwiseFFN(nn.Layer): """A faithful implementation of Position-wise Feed-Forward Network in `Attention is All You Need `_. @@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer): The probability of the Dropout applied to the output of the first layer, by default 0. """ - def __init__(self, - input_size: int, - hidden_size: int, - dropout=0.0): + + def __init__(self, input_size: int, hidden_size: int, dropout=0.0): super(PositionwiseFFN, self).__init__() self.linear1 = nn.Linear(input_size, hidden_size) self.linear2 = nn.Linear(hidden_size, input_size) @@ -86,16 +99,17 @@ class TransformerEncoderLayer(nn.Layer): ------ It uses the PostLN (post layer norm) scheme. """ + def __init__(self, d_model, n_heads, d_ffn, dropout=0.): super(TransformerEncoderLayer, self).__init__() self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.dropout = dropout - + def forward(self, x, mask): """Forward pass of TransformerEncoderLayer. @@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer): """ context_vector, attn_weights = self.self_mha(x, x, x, mask) x = self.layer_norm1( - F.dropout(x + context_vector, - self.dropout, - training=self.training)) - + F.dropout( + x + context_vector, self.dropout, training=self.training)) + x = self.layer_norm2( - F.dropout(x + self.ffn(x), - self.dropout, - training=self.training)) + F.dropout( + x + self.ffn(x), self.dropout, training=self.training)) return x, attn_weights @@ -155,19 +167,20 @@ class TransformerDecoderLayer(nn.Layer): ------ It uses the PostLN (post layer norm) scheme. """ + def __init__(self, d_model, n_heads, d_ffn, dropout=0.): super(TransformerDecoderLayer, self).__init__() self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - + self.dropout = dropout - + def forward(self, q, k, v, encoder_mask, decoder_mask): """Forward pass of TransformerEncoderLayer. @@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer): cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] Decoder-encoder cross attention. """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) + context_vector, self_attn_weights = self.self_mha(q, q, q, + decoder_mask) q = self.layer_norm1( - F.dropout(q + context_vector, - self.dropout, - training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask) + F.dropout( + q + context_vector, self.dropout, training=self.training)) + + context_vector, cross_attn_weights = self.cross_mha(q, k, v, + encoder_mask) q = self.layer_norm2( - F.dropout(q + context_vector, - self.dropout, - training=self.training)) - + F.dropout( + q + context_vector, self.dropout, training=self.training)) + q = self.layer_norm3( - F.dropout(q + self.ffn(q), - self.dropout, - training=self.training)) + F.dropout( + q + self.ffn(q), self.dropout, training=self.training)) return q, self_attn_weights, cross_attn_weights diff --git a/parakeet/training/__init__.py b/parakeet/training/__init__.py index cb1c59b..aec401c 100644 --- a/parakeet/training/__init__.py +++ b/parakeet/training/__init__.py @@ -1,2 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.training.cli import * from parakeet.training.experiment import * diff --git a/parakeet/training/cli.py b/parakeet/training/cli.py index e6b6fe5..a3cfbda 100644 --- a/parakeet/training/cli.py +++ b/parakeet/training/cli.py @@ -1,5 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse + def default_argument_parser(): r"""A simple yet genral argument parser for experiments with parakeet. @@ -46,5 +61,5 @@ def default_argument_parser(): # overwrite extra config and default config parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") # yapd: enable - + return parser diff --git a/parakeet/training/default_config.py b/parakeet/training/default_config.py index f4b9c29..583f6e6 100644 --- a/parakeet/training/default_config.py +++ b/parakeet/training/default_config.py @@ -1,12 +1,26 @@ -from yacs.config import CfgNode +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from yacs.config import CfgNode _C = CfgNode( dict( - valid_interval=1000, # validation - save_interval=10000, # checkpoint - max_iteration=900000, # max iteration to train - ) -) + valid_interval=1000, # validation + save_interval=10000, # checkpoint + max_iteration=900000, # max iteration to train + )) + def get_default_training_config(): return _C.clone() diff --git a/parakeet/training/experiment.py b/parakeet/training/experiment.py index 1bf0af6..16da93d 100644 --- a/parakeet/training/experiment.py +++ b/parakeet/training/experiment.py @@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools __all__ = ["ExperimentBase"] + class ExperimentBase(object): """ An experiment template in order to structure the training code and take diff --git a/parakeet/utils/checkpoint.py b/parakeet/utils/checkpoint.py index ec6f282..0d2a2e2 100644 --- a/parakeet/utils/checkpoint.py +++ b/parakeet/utils/checkpoint.py @@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int: return iteration + def _save_checkpoint(checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpointed. @@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int): with open(checkpoint_record, "wt") as handle: handle.write("model_checkpoint_path: step-{}".format(iteration)) + def load_parameters(model, optimizer=None, checkpoint_dir=None, @@ -97,18 +99,19 @@ def load_parameters(model, params_path = checkpoint_path + ".pdparams" model_dict = paddle.load(params_path) model.set_state_dict(model_dict) - print("[checkpoint] Rank {}: loaded model from {}".format( - local_rank, params_path)) - + print("[checkpoint] Rank {}: loaded model from {}".format(local_rank, + params_path)) + optimizer_path = checkpoint_path + ".pdopt" if optimizer and os.path.isfile(optimizer_path): optimizer_dict = paddle.load(optimizer_path) optimizer.set_state_dict(optimizer_dict) - print("[checkpoint] Rank {}: loaded optimizer state from {}". - format(local_rank, optimizer_path)) + print("[checkpoint] Rank {}: loaded optimizer state from {}".format( + local_rank, optimizer_path)) return iteration + @mp_tools.rank_zero_only def save_parameters(checkpoint_dir, iteration, model, optimizer=None): """Checkpoint the latest trained model parameters. @@ -124,7 +127,7 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None): None """ checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration)) - + model_dict = model.state_dict() params_path = checkpoint_path + ".pdparams" paddle.save(model_dict, params_path) diff --git a/parakeet/utils/internals.py b/parakeet/utils/internals.py index c72a9b0..968a604 100644 --- a/parakeet/utils/internals.py +++ b/parakeet/utils/internals.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from paddle.framework import core diff --git a/parakeet/utils/layer_tools.py b/parakeet/utils/layer_tools.py index 2268377..fcda44f 100644 --- a/parakeet/utils/layer_tools.py +++ b/parakeet/utils/layer_tools.py @@ -28,6 +28,7 @@ def summary(layer: nn.Layer): print("layer has {} parameters, {} elements.".format(num_params, num_elements)) + def gradient_norm(layer: nn.Layer): grad_norm_dict = {} for name, param in layer.state_dict().items(): @@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer): grad_norm_dict[name] = np.linalg.norm(grad) / grad.size return grad_norm_dict + def recursively_remove_weight_norm(layer: nn.Layer): for layer in layer.sublayers(): try: @@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer): # ther is not weight norm hoom in this layer pass + def freeze(layer: nn.Layer): for param in layer.parameters(): param.trainable = False + def unfreeze(layer: nn.Layer): for param in layer.parameters(): param.trainable = True diff --git a/parakeet/utils/mp_tools.py b/parakeet/utils/mp_tools.py index 0b9c6dc..a4bc97a 100644 --- a/parakeet/utils/mp_tools.py +++ b/parakeet/utils/mp_tools.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle from paddle import distributed as dist from functools import wraps @@ -11,11 +25,8 @@ def rank_zero_only(func): @wraps(func) def wrapper(*args, **kwargs): if local_rank != 0: - return + return result = func(*args, **kwargs) return result - + return wrapper - - - diff --git a/parakeet/utils/scheduler.py b/parakeet/utils/scheduler.py index 97e98ec..4d41aca 100644 --- a/parakeet/utils/scheduler.py +++ b/parakeet/utils/scheduler.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math __all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"] @@ -24,7 +38,7 @@ class PieceWise(SchedulerBase): self.xs = [item[0] for item in anchors] self.ys = [item[1] for item in anchors] self.num_anchors = len(self.xs) - + def __call__(self, step): i = 0 for x in self.xs: @@ -34,8 +48,8 @@ class PieceWise(SchedulerBase): return self.ys[0] if i == self.num_anchors: return self.ys[-1] - k = (self.ys[i] - self.ys[i-1]) / (self.xs[i] - self.xs[i-1]) - out = self.ys[i-1] + (step - self.xs[i-1]) * k + k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1]) + out = self.ys[i - 1] + (step - self.xs[i - 1]) * k return out @@ -47,7 +61,7 @@ class StepWise(SchedulerBase): self.xs = [item[0] for item in anchors] self.ys = [item[1] for item in anchors] self.num_anchors = len(self.xs) - + def __call__(self, step): i = 0 for x in self.xs: @@ -58,5 +72,4 @@ class StepWise(SchedulerBase): return self.ys[-1] if i == 0: return self.ys[0] - return self.ys[i-1] - + return self.ys[i - 1] diff --git a/setup.py b/setup.py index ee5f215..0fa9eb7 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,6 @@ setup_info = dict( description='Speech synthesis tools and models based on Paddlepaddle', long_description=long_description, license='Apache 2', - python_requires='>=3.6', install_requires=[ 'numpy', @@ -71,23 +70,18 @@ setup_info = dict( 'yacs', 'tensorboardX', ], - extras_require={ - 'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], - }, + extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], }, # Package info packages=find_packages(exclude=('tests', 'tests.*')), - zip_safe=True, - - classifiers = [ + zip_safe=True, + classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Topic :: Scientific/Engineering :: Artificial Intelligence' 'License :: OSI Approved :: Apache2 License', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', - ], - - ) + ], ) setup(**setup_info)