Merge pull request #66 from iclementine/reborn

format code and discard opencc
Feiyu Chan 2020-12-20 13:53:31 +08:00 committed by GitHub
commit fe7ddc2aaf
72 changed files with 1258 additions and 1571 deletions

View File

@@ -228,6 +228,6 @@ Parakeet also provides pretrained parameters for the example models, which can be obtained from the table below
Under development.

## Copyright and License

Parakeet is provided under the [Apache-2.0 license](LICENSE).

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
@@ -14,7 +28,6 @@
# import sys
# sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'parakeet'
@@ -24,7 +37,6 @@ author = 'parakeet-developers'

# The full version, including alpha/beta/rc tags
release = '0.2'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
@@ -33,7 +45,7 @@ release = '0.2'
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.viewcode',
    "sphinx_rtd_theme",
    'sphinx.ext.mathjax',
    'numpydoc',
]
@@ -46,7 +58,6 @@ templates_path = ['_templates']

# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for

View File

@@ -18,7 +18,7 @@
Common formats for configuration files include `ini`, `yaml`, `toml`, and `json`.

`ini`
Pros: simple; supports string interpolation and similar operations.
Cons: only supports two levels of nesting; values carry no type information, so they have to be cast manually when parsed.
@@ -102,11 +102,3 @@ optional arguments:
  --opts ...            options to overwrite --config file and the default
                        config, passing in KEY VALUE pairs
```
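
The override chain described in this document (built-in defaults, then the `--config` YAML file, then `--opts` KEY VALUE pairs) can be sketched end to end with yacs. A minimal sketch, assuming a `get_cfg_defaults` helper like the `config.py` modules later in this PR; the config fields here are illustrative only:

```python
import argparse

from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(dict(
    batch_size=16,      # illustrative field
    sample_rate=22050,  # illustrative field
))


def get_cfg_defaults():
    """Return a fresh copy of the default config."""
    return _C.clone()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, metavar="FILE",
                        help="extra config to overwrite the default config")
    parser.add_argument("--opts", nargs=argparse.REMAINDER,
                        help="options to overwrite --config file and the "
                             "default config, passing in KEY VALUE pairs")
    args = parser.parse_args()

    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)  # YAML file overrides defaults
    if args.opts:
        config.merge_from_list(args.opts)    # KEY VALUE pairs override everything
    config.freeze()
    print(config)
```

Invoked as, say, `python myscript.py --opts data.batch_size 32` (script name hypothetical), the printed config reflects the override.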

View File

@@ -21,7 +21,7 @@
In general, we treat a Dataset subclass as an adapter between a dataset and the concrete needs of an experiment.

parakeet also provides several higher-order Dataset classes that derive new Datasets from existing ones:
1. TupleDataset and DictDataset, for combining fields;
2. SliceDataset, SubsetDataset, and ChainDataset, for splitting and concatenating datasets;
@@ -137,7 +137,7 @@ class Transform(object):
        self.processor = AudioProcessor(
            sample_rate=22050,
            n_fft=1024,
            win_length=1024,
            hop_length=256,
            f_max=8000)
        self.normalizer = LogMagnitude()
@@ -167,7 +167,7 @@ ljspeech = TransformDataset(meta, transform)
Of course, you can also write a dedicated conversion script that saves the transformed dataset to disk, and then a matching Dataset subclass that loads the saved files; in practice this is more efficient.

Next we need a callable object that collects multiple examples into a batch. Since the ids and mel spectrograms are sequence data, they need to be padded (a standalone sketch of this padding step follows this section).
```python
class LJSpeechCollector(object):
@@ -197,10 +197,10 @@ def create_dataloader(source_path, valid_size, batch_size):
    valid_set, train_set = dataset.split(lj, valid_size)
    train_loader = DataLoader(
        train_set,
        return_list=False,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        collate_fn=LJSpeechCollector())
    valid_loader = DataLoader(
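
As promised above, a standalone sketch of the padding step. The real code uses `batch_text_id` and `batch_spec` from `parakeet.data.batch`; `batch_text_ids` below is a hypothetical plain-numpy stand-in that shows the idea:

```python
import numpy as np


def batch_text_ids(seqs, padding_idx=0):
    """Pad 1-D id sequences to the longest length and stack into a batch."""
    max_len = max(len(s) for s in seqs)
    return np.stack([
        np.pad(s, (0, max_len - len(s)), constant_values=padding_idx)
        for s in seqs
    ])


ids = [np.array([5, 3, 9], dtype=np.int64), np.array([7, 2], dtype=np.int64)]
print(batch_text_ids(ids))
# [[5 3 9]
#  [7 2 0]]
```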

View File

@@ -72,4 +72,4 @@ def train(self):
```python
exp.run()
```
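
The hunk above shows the tail of an experiment: once the subclass is assembled, `exp.run()` drives the whole training loop. A hedged sketch of the overall shape; the hook names mirror the `train.py` files later in this diff, while the constructor and the commented-out driver lines are assumptions:

```python
from parakeet.training.experiment import ExperimentBase


class MyExperiment(ExperimentBase):
    def setup_model(self):
        # build self.model, self.optimizer, and self.criterion here
        pass

    def setup_dataloader(self):
        # build self.train_loader and self.valid_loader here
        pass

    def train_batch(self):
        # fetch one batch, compute the loss, and take an optimizer step
        pass

    def valid(self):
        # evaluate on the validation set and log the results
        pass


# exp = MyExperiment(config, args)  # assumed constructor, cf. main_sp in train.py
# exp.run()
```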

View File

@@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+
```
This repository contains several examples; see [Parakeet/examples](../examples). These experiments are provided as runnable samples for users, and users are welcome to add new models and experiments and contribute code to `Parakeet`.

View File

@@ -31,7 +31,7 @@ python -m pip install paddlepaddle==2.0.0rc0 -i https://mirror.baidu.com/pypi/si
# ubuntu, debian
sudo apt-get install libsndfile1

# centos, fedora,
sudo yum install libsndfile

# openSUSE

View File

@@ -9,10 +9,3 @@ Parakeet provides users and developers with:
1. reusable models and common modules;
2. complete experiments covering the whole pipeline, from data processing and model training to prediction;
3. high-quality, out-of-the-box models.

View File

@@ -1,21 +1,34 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=16,  # batch size
        valid_size=64,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        f_max=8000,  # Hz, max frequency when converting to mel
        d_mel=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
        mel_start_value=0.5,  # value for the starting frame
        mel_end_value=-0.5,  # value for the ending frame
    ))

_C.model = CN(
    dict(
@@ -31,22 +44,21 @@ _C.model = CN(
        postnet_kernel_size=5,  # decoder postnet (cnn)'s kernel size
        max_reduction_factor=10,  # max reduction factor
        dropout=0.1,  # global dropout probability
        stop_loss_scale=8.0,  # scale factor for stop loss
        decoder_prenet_dropout=0.5,  # decoder prenet dropout probability
    ))

_C.training = CN(
    dict(
        lr=1e-4,  # learning rate
        drop_n_heads=[[0, 0], [15000, 1]],
        reduction_factor=[[0, 10], [80000, 4], [200000, 2]],
        plot_interval=1000,  # plot attention and spectrogram
        valid_interval=1000,  # validation
        save_interval=10000,  # checkpoint
        max_iteration=900000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from pathlib import Path
import pickle
@@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader
from parakeet.data.batch import batch_spec, batch_text_id
from parakeet.data import dataset


class LJSpeech(Dataset):
    """A simple dataset adaptor for the processed ljspeech dataset."""

    def __init__(self, root):
        self.root = Path(root).expanduser()
        records = []
@@ -35,13 +51,13 @@ class Transform(object):
        self.end_value = end_value

    def __call__(self, example):
        ids, mel = example  # ids already have <s> and </s>
        ids = np.array(ids, dtype=np.int64)
        # add start and end frame
        mel = np.pad(
            mel, [(0, 0), (1, 1)],
            mode='constant',
            constant_values=[(0, 0), (self.start_value, self.end_value)])
        stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
        stop_labels[-1] = 2
        # actually this thing can also be done within the model
@@ -50,6 +66,7 @@ class Transform(object):
class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_idx=0, padding_value=0.):
        self.padding_idx = padding_idx
        self.padding_value = padding_value
@@ -67,15 +84,16 @@ class LJSpeechCollector(object):
def create_dataloader(config, source_path):
    lj = LJSpeech(source_path)
    transform = Transform(config.data.mel_start_value,
                          config.data.mel_end_value)
    lj = dataset.TransformDataset(lj, transform)
    valid_set, train_set = dataset.split(lj, config.data.valid_size)
    data_collator = LJSpeechCollector(padding_idx=config.data.padding_idx)
    train_loader = DataLoader(
        train_set,
        batch_size=config.data.batch_size,
        shuffle=True,
        drop_last=True,
        collate_fn=data_collator)
    valid_loader = DataLoader(
@@ -85,4 +103,3 @@ def create_dataloader(config, source_path):
        drop_last=False,
        collate_fn=data_collator)
    return train_loader, valid_loader
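
One detail worth calling out in the `Transform.__call__` hunk above: `np.pad` is given per-axis `constant_values`, so a single call both adds one frame on each side of the time axis and fills the new first and last frames with the start and end marker values. A standalone numpy check of that behavior:

```python
import numpy as np

mel = np.zeros((3, 4), dtype=np.float32)  # (n_mels, time)
padded = np.pad(
    mel, [(0, 0), (1, 1)],
    mode='constant',
    constant_values=[(0, 0), (0.5, -0.5)])

print(padded.shape)   # (3, 6): one extra frame at each end
print(padded[:, 0])   # [0.5 0.5 0.5]    -> start frame
print(padded[:, -1])  # [-0.5 -0.5 -0.5] -> end frame

# the stop labels then mark the final (end) frame with class 2
stop_labels = np.ones([padded.shape[1]], dtype=np.int64)
stop_labels[-1] = 2
print(stop_labels)    # [1 1 1 1 1 2]
```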

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tqdm
import pickle
@@ -11,6 +25,7 @@ from parakeet.frontend import English
from config import get_cfg_defaults


def create_dataset(config, source_path, target_path, verbose=False):
    # create output dir
    target_path = Path(target_path).expanduser()
@@ -23,11 +38,11 @@ def create_dataset(config, source_path, target_path, verbose=False):
        sample_rate=config.data.sample_rate,
        n_fft=config.data.n_fft,
        n_mels=config.data.d_mel,
        win_length=config.data.win_length,
        hop_length=config.data.hop_length,
        f_max=config.data.f_max)
    normalizer = LogMagnitude()

    records = []
    for (fname, text, _) in tqdm.tqdm(meta_data):
        wav = processor.read_wav(fname)
@@ -42,12 +57,13 @@ def create_dataset(config, source_path, target_path, verbose=False):
        np.save(mel_path / mel_name, mel)
    if verbose:
        print("save mel spectrograms into {}".format(mel_path))

    # save meta data as pickle archive
    with open(target_path / "metadata.pkl", 'wb') as f:
        pickle.dump(records, f)
    if verbose:
        print("saved metadata into {}".format(target_path /
                                              "metadata.pkl"))

    # also save meta data into text format for inspection
    with open(target_path / "metadata.txt", 'wt') as f:
@@ -55,21 +71,31 @@ def create_dataset(config, source_path, target_path, verbose=False):
            phoneme_str = "|".join(phonemes)
            f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
    if verbose:
        print("saved metadata into {}".format(target_path /
                                              "metadata.txt"))
    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--input", type=str, help="path of the ljspeech dataset")
    parser.add_argument(
        "--output", type=str, help="path to save output dataset")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
from pathlib import Path
@@ -13,21 +27,22 @@ from parakeet.utils.display import add_attention_plots
from config import get_cfg_defaults


@paddle.fluid.dygraph.no_grad
def main(config, args):
    paddle.set_device(args.device)

    # model
    frontend = English()
    model = TransformerTTS.from_pretrained(frontend, config,
                                           args.checkpoint_path)
    model.eval()

    # inputs
    input_path = Path(args.input).expanduser()
    with open(input_path, "rt") as f:
        sentences = f.readlines()

    output_dir = Path(args.output).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
@@ -38,22 +53,36 @@ def main(config, args):
        mel_output = mel_output.T  # (C, T)
        np.save(str(output_dir / f"sentence_{i}"), mel_output)
        if args.verbose:
            print("spectrogram saved at {}".format(output_dir /
                                                   f"sentence_{i}.npy"))


if __name__ == "__main__":
    config = get_cfg_defaults()

    parser = argparse.ArgumentParser(
        description="generate mel spectrogram with TransformerTTS.")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
    parser.add_argument("--input", type=str, help="path of the text sentences")
    parser.add_argument("--output", type=str, help="path to save outputs")
    parser.add_argument(
        "--device", type=str, default="cpu", help="device type to use.")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import logging
from pathlib import Path
@@ -19,12 +33,13 @@ from parakeet.training.experiment import ExperimentBase
from config import get_cfg_defaults
from ljspeech import LJSpeech, LJSpeechCollector, Transform


class Experiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        frontend = English()
        model = TransformerTTS(
            frontend,
            d_encoder=config.model.d_encoder,
            d_decoder=config.model.d_decoder,
            d_mel=config.data.d_mel,
@@ -46,8 +61,7 @@ class Experiment(ExperimentBase):
            beta1=0.9,
            beta2=0.98,
            epsilon=1e-9,
            parameters=model.parameters())
        criterion = TransformerTTSLoss(config.model.stop_loss_scale)
        drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
        reduction_factor = scheduler.StepWise(config.training.reduction_factor)
@@ -63,21 +77,24 @@ class Experiment(ExperimentBase):
        config = self.config
        ljspeech_dataset = LJSpeech(args.data)
        transform = Transform(config.data.mel_start_value,
                              config.data.mel_end_value)
        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
                                                    transform)
        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)
        batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

        if not self.parallel:
            train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                num_replicas=dist.get_world_size(),
                rank=dist.get_rank(),
@@ -95,11 +112,11 @@ class Experiment(ExperimentBase):
    def compute_outputs(self, text, mel, stop_label):
        model_core = self.model._layers if self.parallel else self.model
        model_core.set_constants(
            self.reduction_factor(self.iteration),
            self.drop_n_heads(self.iteration))
        # TODO(chenfeiyu): we can combine these 2 slices
        mel_input = mel[:, :-1, :]
        reduced_mel_input = mel_input[:, ::model_core.r, :]
        outputs = self.model(text, reduced_mel_input)
        return outputs
@@ -115,11 +132,8 @@ class Experiment(ExperimentBase):
        time_steps = mel_target.shape[1]
        losses = self.criterion(
            mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
            mel_target, stop_logits[:, :time_steps, :], stop_label_target)
        return losses

    def train_batch(self):
@@ -133,7 +147,7 @@ class Experiment(ExperimentBase):
        outputs = self.compute_outputs(text, mel, stop_label)
        losses = self.compute_losses(batch, outputs)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start
@@ -141,14 +155,17 @@ class Experiment(ExperimentBase):
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v,
                                           self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
@@ -163,10 +180,9 @@ class Experiment(ExperimentBase):
            if i < 2:
                attention_weights = outputs["cross_attention_weights"]
                display.add_multi_attention_plots(
                    self.visualizer,
                    f"valid_sentence_{i}_cross_attention_weights",
                    attention_weights, self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@@ -191,7 +207,7 @@ if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
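
A quick standalone check of the `TODO(chenfeiyu)` in `compute_outputs` above: dropping the last frame and then taking every `r`-th frame can indeed be fused into a single slice, because `x[:, :-1, :][:, ::r, :]` selects the same indices as `x[:, :-1:r, :]`:

```python
import numpy as np

mel = np.arange(2 * 9 * 3).reshape(2, 9, 3)  # hypothetical (batch, time, mel)
r = 4  # reduction factor

two_slices = mel[:, :-1, :][:, ::r, :]  # as written in compute_outputs
one_slice = mel[:, :-1:r, :]            # the combined slice
assert (two_slices == one_slice).all()
```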

View File

@@ -1,40 +1,52 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=8,  # batch size
        valid_size=16,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        f_max=8000,  # Hz, max frequency when converting to mel
        n_mels=80,  # mel bands
        clip_frames=65,  # mel clip frames
    ))

_C.model = CN(
    dict(
        upsample_factors=[16, 16],
        n_flows=8,  # number of flows in WaveFlow
        n_layers=8,  # number of conv blocks in each flow
        n_group=16,  # folding factor of audio and spectrogram
        channels=128,  # residual channels in each flow
        kernel_size=[3, 3],  # kernel size in each conv block
        sigma=1.0,  # stddev of the random noise
    ))

_C.training = CN(
    dict(
        lr=2e-4,  # learning rate
        valid_interval=1000,  # validation
        save_interval=10000,  # checkpoint
        max_iteration=3000000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from pathlib import Path
import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
from parakeet.data import dataset
from parakeet.audio import AudioProcessor


class LJSpeech(Dataset):
    """A simple dataset adaptor for the processed ljspeech dataset."""

    def __init__(self, root):
        self.root = Path(root).expanduser()
        meta_data = pandas.read_csv(
            str(self.root / "metadata.csv"),
            sep="\t",
            header=None,
            names=["fname", "frames", "samples"])
        records = []
        for row in meta_data.itertuples():
            mel_path = str(self.root / "mel" / (row.fname + ".npy"))
            wav_path = str(self.root / "wav" / (row.fname + ".npy"))
            records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_value=0.):
        self.padding_value = padding_value
@@ -52,9 +68,9 @@ class LJSpeechCollector(object):
class LJSpeechClipCollector(object):
    def __init__(self, clip_frames=65, hop_length=256):
        self.clip_frames = clip_frames
        self.hop_length = hop_length

    def __call__(self, examples):
        mels = []
        wavs = []
@@ -70,9 +86,7 @@ class LJSpeechClipCollector(object):
        mel, wav = example
        frames = mel.shape[-1]
        start = np.random.randint(0, frames - self.clip_frames)
        mel_clip = mel[:, start:start + self.clip_frames]
        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
                       self.hop_length]
        return mel_clip, wav_clip
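
The clip logic above keeps the mel clip and the audio clip aligned through `hop_length`: `clip_frames` mel frames always correspond to exactly `clip_frames * hop_length` audio samples, starting at sample `start * hop_length`. A standalone numpy check (the utterance length here is made up):

```python
import numpy as np

clip_frames, hop_length = 65, 256  # values from the waveflow config above
frames = 100                       # hypothetical utterance length in frames
mel = np.random.randn(80, frames).astype(np.float32)
wav = np.random.randn(frames * hop_length).astype(np.float32)

start = np.random.randint(0, frames - clip_frames)
mel_clip = mel[:, start:start + clip_frames]
wav_clip = wav[start * hop_length:(start + clip_frames) * hop_length]

assert mel_clip.shape[1] == clip_frames
assert wav_clip.shape[0] == clip_frames * hop_length
```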

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tqdm
import csv
@@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(exist_ok=True)

    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
                          config.hop_length, config.n_mels)
    file_names = []

    for example in tqdm.tqdm(dataset):
@@ -107,23 +117,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
        np.save(str(mel_dir / base_name), mel)
        file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

    meta_data = pd.DataFrame.from_records(file_names)
    meta_data.to_csv(
        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
    print("saved metadata into {}".format(
        os.path.join(output_dir, "metadata.csv")))
    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--input", type=str, help="path of the ljspeech dataset")
    parser.add_argument(
        "--output", type=str, help="path to save output dataset")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import soundfile as sf
@@ -8,9 +22,9 @@ import parakeet
from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow
from parakeet.utils import layer_tools, checkpoint
from config import get_cfg_defaults


def main(config, args):
    paddle.set_device(args.device)
    model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path)
@@ -23,7 +37,8 @@ def main(config, args):
    for file_path in mel_dir.iterdir():
        mel = np.load(str(file_path))
        audio = model.predict(mel)
        audio_path = output_dir / (
            os.path.splitext(file_path.name)[0] + ".wav")
        sf.write(audio_path, audio, config.data.sample_rate)
        print("[synthesize] {} -> {}".format(file_path, audio_path))
@@ -31,17 +46,32 @@ def main(config, args):
if __name__ == "__main__":
    config = get_cfg_defaults()

    parser = argparse.ArgumentParser(
        description="synthesize audio with WaveFlow from mel spectrograms.")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
    parser.add_argument(
        "--input",
        type=str,
        help="path of directory containing mel spectrogram (in .npy format)")
    parser.add_argument("--output", type=str, help="path to save outputs")
    parser.add_argument(
        "--device", type=str, default="cpu", help="device type to use.")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
@@ -49,4 +79,4 @@ if __name__ == "__main__":
    print(config)
    print(args)
    main(config, args)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path
import numpy as np
@@ -34,7 +48,8 @@ class Experiment(ExperimentBase):
        if self.parallel > 1:
            model = paddle.DataParallel(model)
        optimizer = paddle.optimizer.Adam(
            config.training.lr, parameters=model.parameters())
        criterion = WaveFlowLoss(sigma=config.model.sigma)

        self.model = model
@@ -46,20 +61,22 @@ class Experiment(ExperimentBase):
        args = self.args
        ljspeech_dataset = LJSpeech(args.data)
        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)

        batch_fn = LJSpeechClipCollector(config.data.clip_frames,
                                         config.data.hop_length)

        if not self.parallel:
            train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                num_replicas=dist.get_world_size(),
                rank=dist.get_rank(),
@@ -71,7 +88,7 @@ class Experiment(ExperimentBase):
        valid_batch_fn = LJSpeechCollector()
        valid_loader = DataLoader(
            valid_set, batch_size=1, collate_fn=valid_batch_fn)

        self.train_loader = train_loader
        self.valid_loader = valid_loader
@@ -90,17 +107,19 @@ class Experiment(ExperimentBase):
        mel, wav = batch
        z, log_det_jocobian = self.compute_outputs(mel, wav)
        loss = self.criterion(z, log_det_jocobian)
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        loss_value = float(loss)
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += "loss: {:>.6f}".format(loss_value)
        self.logger.info(msg)
        self.visualizer.add_scalar(
            "train/loss", loss_value, global_step=self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
@@ -112,7 +131,8 @@ class Experiment(ExperimentBase):
            loss = self.criterion(z, log_det_jocobian)
            valid_losses.append(float(loss))
        valid_loss = np.mean(valid_losses)
        self.visualizer.add_scalar(
            "valid/loss", valid_loss, global_step=self.iteration)


def main_sp(config, args):
@@ -132,7 +152,7 @@ if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)

View File

@@ -1,19 +1,32 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=8,  # batch size
        valid_size=16,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=2048,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        # f_max=8000,  # Hz, max frequency when converting to mel
        n_mels=80,  # mel bands
        train_clip_seconds=0.5,  # audio clip length (in seconds)
    ))

_C.model = CN(
    dict(
@@ -21,24 +34,22 @@ _C.model = CN(
        n_stack=3,
        n_loop=10,
        filter_size=2,
        residual_channels=128,  # residual channels
        loss_type="mog",
        output_dim=3,  # single gaussian
        log_scale_min=-9.0, ))

_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        anneal_rate=0.5,  # learning rate decay rate
        anneal_interval=200000,  # decrease lr by anneal_rate every anneal_interval steps
        valid_interval=1000,  # validation
        save_interval=10000,  # checkpoint
        max_iteration=3000000,  # max iteration to train
        gradient_max_norm=100.0  # global norm of gradients
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from pathlib import Path
import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
from parakeet.data import dataset
from parakeet.audio import AudioProcessor


class LJSpeech(Dataset):
    """A simple dataset adaptor for the processed ljspeech dataset."""

    def __init__(self, root):
        self.root = Path(root).expanduser()
        meta_data = pandas.read_csv(
            str(self.root / "metadata.csv"),
            sep="\t",
            header=None,
            names=["fname", "frames", "samples"])
        records = []
        for row in meta_data.itertuples():
            mel_path = str(self.root / "mel" / (row.fname + ".npy"))
            wav_path = str(self.root / "wav" / (row.fname + ".npy"))
            records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_value=0.):
        self.padding_value = padding_value
@@ -48,15 +64,15 @@ class LJSpeechCollector(object):
        wavs = [example[1] for example in examples]
        mels = batch_spec(mels, pad_value=self.padding_value)
        wavs = batch_wav(wavs, pad_value=self.padding_value)
        audio_starts = np.zeros((batch_size, ), dtype=np.int64)
        return mels, wavs, audio_starts


class LJSpeechClipCollector(object):
    def __init__(self, clip_frames=65, hop_length=256):
        self.clip_frames = clip_frames
        self.hop_length = hop_length

    def __call__(self, examples):
        mels = []
        wavs = []
@@ -75,7 +91,8 @@ class LJSpeechClipCollector(object):
        mel, wav = example
        frames = mel.shape[-1]
        start = np.random.randint(0, frames - self.clip_frames)
        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
                       self.hop_length]
        return mel, wav_clip, start
@@ -132,7 +149,3 @@ class DataCollector(object):
        audios = np.array(audios, dtype=np.float32)
        audio_starts = np.array(audio_starts, dtype=np.int64)
        return audios, mels, audio_starts

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tqdm
import csv
@@ -23,7 +37,7 @@ class Transform(object):
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.spec_normalizer = UnitMagnitude(min=1e-5)

    def __call__(self, example):
@@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(exist_ok=True)

    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
                          config.hop_length, config.n_mels)
    file_names = []

    for example in tqdm.tqdm(dataset):
@@ -108,23 +118,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
        np.save(str(mel_dir / base_name), mel)
        file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

    meta_data = pd.DataFrame.from_records(file_names)
    meta_data.to_csv(
        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
    print("saved meta data in to {}".format(
        os.path.join(output_dir, "metadata.csv")))
    print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--input", type=str, help="path of the ljspeech dataset")
    parser.add_argument(
        "--output", type=str, help="path to save output dataset")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:
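The `--config` / `--opts` pair used throughout these scripts follows the yacs pattern: defaults in code, a YAML file on top, then KEY VALUE pairs on top of that. A minimal sketch of how the layers compose, assuming `get_cfg_defaults` returns a `yacs.config.CfgNode` (the keys here are illustrative, not the real config):

```python
from yacs.config import CfgNode


def get_cfg_defaults():
    # illustrative defaults only; the real keys live in config.py
    cfg = CfgNode()
    cfg.sample_rate = 22050
    cfg.n_mels = 80
    return cfg.clone()


config = get_cfg_defaults()
# 1. a YAML file overrides the defaults:
#    config.merge_from_file("custom.yaml")
# 2. --opts KEY VALUE pairs override both:
config.merge_from_list(["n_mels", "128"])
assert config.n_mels == 128
```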
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import soundfile as sf
@@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint
from config import get_cfg_defaults


def main(config, args):
    paddle.set_device(args.device)
    model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path)
@@ -22,7 +37,8 @@ def main(config, args):
    for file_path in mel_dir.iterdir():
        mel = np.load(str(file_path))
        audio = model.predict(mel)
        audio_path = output_dir / (
            os.path.splitext(file_path.name)[0] + ".wav")
        sf.write(audio_path, audio, config.data.sample_rate)
        print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -30,17 +46,32 @@ def main(config, args):
if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = argparse.ArgumentParser(
        description="generate mel spectrogram with TransformerTTS.")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
    parser.add_argument(
        "--input",
        type=str,
        help="path of directory containing mel spectrogram (in .npy format)")
    parser.add_argument("--output", type=str, help="path to save outputs")
    parser.add_argument(
        "--device", type=str, default="cpu", help="device type to use.")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
@@ -48,4 +79,4 @@ if __name__ == "__main__":
    print(config)
    print(args)
    main(config, args)
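For orientation, the synthesis loop above reduces to this pattern (a sketch; the `model.predict` interface, one mel array in and a 1-D float waveform out, is assumed from the surrounding diff):

```python
from pathlib import Path
import numpy as np
import soundfile as sf


def synthesize_dir(model, mel_dir, output_dir, sample_rate):
    """Vocode every .npy mel spectrogram in mel_dir into a .wav file."""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    for file_path in Path(mel_dir).glob("*.npy"):
        mel = np.load(str(file_path))
        audio = model.predict(mel)  # assumed: returns a 1-D waveform
        sf.write(str(output_dir / (file_path.stem + ".wav")), audio,
                 sample_rate)
```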
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path
import math
@@ -26,7 +40,7 @@ class Experiment(ExperimentBase):
        config = self.config
        model = ConditionalWaveNet(
            upsample_factors=config.model.upsample_factors,
            n_stack=config.model.n_stack,
            n_loop=config.model.n_loop,
            residual_channels=config.model.residual_channels,
            output_dim=config.model.output_dim,
@@ -39,13 +53,13 @@ class Experiment(ExperimentBase):
            model = paddle.DataParallel(model)

        lr_scheduler = paddle.optimizer.lr.StepDecay(
            config.training.lr, config.training.anneal_interval,
            config.training.anneal_rate)
        optimizer = paddle.optimizer.Adam(
            lr_scheduler,
            parameters=model.parameters(),
            grad_clip=paddle.nn.ClipGradByGlobalNorm(
                config.training.gradient_max_norm))

        self.model = model
        self.model_core = model._layer if self.parallel else model
@@ -56,7 +70,8 @@ class Experiment(ExperimentBase):
        args = self.args
        ljspeech_dataset = LJSpeech(args.data)
        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)

        # convolutional net's causal padding size
        context_size = config.model.n_stack \
@@ -66,20 +81,21 @@ class Experiment(ExperimentBase):
        # frames used to compute loss
        frames_per_second = config.data.sample_rate // config.data.hop_length
        train_clip_frames = math.ceil(config.data.train_clip_seconds *
                                      frames_per_second)

        num_frames = train_clip_frames + context_frames
        batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)

        if not self.parallel:
            train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
@@ -89,7 +105,7 @@ class Experiment(ExperimentBase):
        valid_batch_fn = LJSpeechCollector()
        valid_loader = DataLoader(
            valid_set, batch_size=1, collate_fn=valid_batch_fn)

        self.train_loader = train_loader
        self.valid_loader = valid_loader
@@ -101,20 +117,22 @@ class Experiment(ExperimentBase):
        self.model.train()
        self.optimizer.clear_grad()
        mel, wav, audio_starts = batch
        y = self.model(wav, mel, audio_starts)
        loss = self.model.loss(y, wav)
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        loss_value = float(loss)
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += "loss: {:>.6f}".format(loss_value)
        self.logger.info(msg)
        self.visualizer.add_scalar(
            "train/loss", loss_value, global_step=self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
@@ -126,7 +144,8 @@ class Experiment(ExperimentBase):
            loss = self.model.loss(y, wav)
            valid_losses.append(float(loss))
        valid_loss = np.mean(valid_losses)
        self.visualizer.add_scalar(
            "valid/loss", valid_loss, global_step=self.iteration)


def main_sp(config, args):
@@ -146,7 +165,7 @@ if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
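The clip/context bookkeeping above is plain arithmetic. A worked sketch with example values (the real ones come from the YAML config; the receptive-field formula assumes `filter_size=2`, so each dilation `2**i` contributes `2**i` samples):

```python
import math

# example config values only
sample_rate = 22050
hop_length = 256
train_clip_seconds = 0.5
n_stack, n_loop = 5, 10

# causal receptive field of the WaveNet in samples:
# dilations are [1, 2, 4, ..., 2**(n_loop - 1)] repeated n_stack times
context_size = 1 + n_stack * (2**n_loop - 1)  # 5116 samples

frames_per_second = sample_rate // hop_length  # 86
train_clip_frames = math.ceil(train_clip_seconds * frames_per_second)  # 43
context_frames = context_size // hop_length  # 19 frames of warm-up context
num_frames = train_clip_frames + context_frames  # 62 frames per clip
```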
View File
@@ -18,15 +18,16 @@ import numpy as np
__all__ = ["AudioProcessor"]


class AudioProcessor(object):
    def __init__(self,
                 sample_rate: int,
                 n_fft: int,
                 win_length: int,
                 hop_length: int,
                 n_mels: int=80,
                 f_min: int=0,
                 f_max: int=None,
                 window="hann",
                 center=True,
                 pad_mode="reflect"):
@@ -40,7 +41,7 @@ class AudioProcessor(object):
        self.window = window
        self.center = center
        self.pad_mode = pad_mode

        # mel
        self.n_mels = n_mels
        self.f_min = f_min
@@ -48,19 +49,18 @@ class AudioProcessor(object):
        self.mel_filter = self._create_mel_filter()
        self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

    def _create_mel_filter(self):
        mel_filter = librosa.filters.mel(self.sample_rate,
                                         self.n_fft,
                                         n_mels=self.n_mels,
                                         fmin=self.f_min,
                                         fmax=self.f_max)
        return mel_filter

    def read_wav(self, filename):
        # resampling may occur
        wav, _ = librosa.load(filename, sr=self.sample_rate)
        return wav

    def write_wav(self, path, wav):
@@ -69,7 +69,7 @@ class AudioProcessor(object):
    def stft(self, wav):
        D = librosa.core.stft(
            wav,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
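As a sanity check of the mel / pseudo-inverse pair above, a sketch (the positional `librosa.filters.mel(sr, n_fft, ...)` call matches the librosa 0.8.x API used here; the pseudo-inverse gives only a lossy reconstruction):

```python
import numpy as np
import librosa

sample_rate, n_fft, n_mels = 22050, 1024, 80
mel_filter = librosa.filters.mel(sample_rate, n_fft, n_mels=n_mels,
                                 fmin=0, fmax=None)  # shape (80, 513)
inv_mel_filter = np.linalg.pinv(mel_filter)          # shape (513, 80)

spec = np.abs(np.random.randn(n_fft // 2 + 1, 100))  # fake linear magnitude
mel = mel_filter @ spec                              # (80, 100) mel magnitude
approx = inv_mel_filter @ mel                        # lossy linear estimate
```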
View File
@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
This modules contains normalizers for spectrogram magnitude. This modules contains normalizers for spectrogram magnitude.
@ -19,22 +32,24 @@ __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"]
class NormalizerBase(object): class NormalizerBase(object):
def transform(self, spec): def transform(self, spec):
raise NotImplementedError("transform must be implemented") raise NotImplementedError("transform must be implemented")
def inverse(self, normalized): def inverse(self, normalized):
raise NotImplementedError("inverse must be implemented") raise NotImplementedError("inverse must be implemented")
class LogMagnitude(NormalizerBase): class LogMagnitude(NormalizerBase):
""" """
This is a simple normalizer used in Waveglow, Waveflow, tacotron2... This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
""" """
def __init__(self, min=1e-7): def __init__(self, min=1e-7):
self.min = min self.min = min
def transform(self, x): def transform(self, x):
x = np.maximum(x, self.min) x = np.maximum(x, self.min)
x = np.log(x) x = np.log(x)
return x return x
def inverse(self, x): def inverse(self, x):
return np.exp(x) return np.exp(x)
@ -44,15 +59,16 @@ class UnitMagnitude(NormalizerBase):
""" """
This is the normalizer used in the This is the normalizer used in the
""" """
def __init__(self, min=1e-5): def __init__(self, min=1e-5):
self.min = min self.min = min
def transform(self, x): def transform(self, x):
db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20 db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20
normalized = (db_scale + 100) / 100 normalized = (db_scale + 100) / 100
clipped = np.clip(normalized, 0, 1) clipped = np.clip(normalized, 0, 1)
return clipped return clipped
def inverse(self, x): def inverse(self, x):
denormalized = np.clip(x, 0, 1) * 100 - 100 denormalized = np.clip(x, 0, 1) * 100 - 100
out = np.exp((denormalized + 20) / 20 * np.log(10)) out = np.exp((denormalized + 20) / 20 * np.log(10))
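`UnitMagnitude.inverse` undoes `transform` exactly wherever the clipping is inactive (magnitudes roughly in [1e-4, 10] for the default `min`); a quick numpy check of the round trip (sketch, standalone copies of the two methods above):

```python
import numpy as np


def transform(x, min=1e-5):
    db_scale = 20 * np.log10(np.maximum(min, x)) - 20
    return np.clip((db_scale + 100) / 100, 0, 1)


def inverse(x):
    denormalized = np.clip(x, 0, 1) * 100 - 100
    return np.exp((denormalized + 20) / 20 * np.log(10))


x = np.array([1e-4, 1e-2, 0.5, 1.0])
np.testing.assert_allclose(inverse(transform(x)), x, rtol=1e-6)
```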
View File
@@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided.
import numpy as np

__all__ = [
    "batch_text_id",
    "batch_wav",
    "batch_spec",
    "TextIDBatcher",
    "WavBatcher",
    "SpecBatcher",
]


class TextIDBatcher(object):
    """A wrapper class for `batch_text_id`."""
@@ -99,8 +104,8 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
        pad_len = max_len - example.shape[-1]
        batch.append(
            np.pad(example, [(0, pad_len)],
                   mode='constant',
                   constant_values=pad_value))

    return np.array(batch, dtype=dtype)
@@ -113,7 +118,11 @@ class SpecBatcher(object):
        self.time_major = time_major

    def __call__(self, minibatch):
        out = batch_spec(
            minibatch,
            pad_value=self.pad_value,
            time_major=self.time_major,
            dtype=self.dtype)
        return out
@@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
    """
    # assume (F, T) or (T, F)
    peek_example = minibatch[0]
    assert len(
        peek_example.shape) == 2, "we only handles mono channel spectrogram"

    # assume (F, n_frame) or (n_frame, F)
    time_idx = 0 if time_major else -1
@@ -143,11 +153,11 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
        if time_major:
            batch.append(
                np.pad(example, [(0, pad_len), (0, 0)],
                       mode='constant',
                       constant_values=pad_value))
        else:
            batch.append(
                np.pad(example, [(0, 0), (0, pad_len)],
                       mode='constant',
                       constant_values=pad_value))

    return np.array(batch, dtype=dtype)
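The padding logic in `batch_wav` / `batch_spec` is plain `np.pad` along the time axis; a minimal sketch for the non-time-major `(F, T)` case:

```python
import numpy as np

# three spectrograms with 80 mel bands and different frame counts
specs = [np.random.rand(80, t).astype(np.float32) for t in (50, 64, 37)]
max_len = max(s.shape[-1] for s in specs)
batch = np.stack([
    np.pad(s, [(0, 0), (0, max_len - s.shape[-1])],
           mode='constant', constant_values=0.)
    for s in specs
])  # shape: (3, 80, 64)
```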
View File
@@ -17,17 +17,25 @@ import paddle
from paddle.io import Dataset

__all__ = [
    "split",
    "TransformDataset",
    "CacheDataset",
    "TupleDataset",
    "DictDataset",
    "SliceDataset",
    "SubsetDataset",
    "FilterDataset",
    "ChainDataset",
]


def split(dataset, first_size):
    """A utility function to split a dataset into two datasets."""
    first = SliceDataset(dataset, 0, first_size)
    second = SliceDataset(dataset, first_size, len(dataset))
    return first, second


class TransformDataset(Dataset):
    def __init__(self, dataset, transform):
        """Dataset which is transformed from another with a transform.
@@ -141,7 +149,7 @@ class DictDataset(Dataset):
                    for i in six.moves.range(length)]
        else:
            return batches

    def __len__(self):
        return self._length
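A note on `split`: it slices in order without shuffling, so the first `first_size` examples become the first dataset. Sketch of the typical call, mirroring the train.py usage above:

```python
# hold out the first valid_size examples for validation
valid_set, train_set = split(ljspeech_dataset, config.data.valid_size)
assert len(valid_set) == config.data.valid_size
assert len(valid_set) + len(train_set) == len(ljspeech_dataset)
```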
View File
@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *
View File
@@ -1,9 +1,24 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
import os
import librosa

__all__ = ["AudioFolderDataset"]


class AudioFolderDataset(Dataset):
    def __init__(self, path, sample_rate, extension="wav"):
        self.root = os.path.expanduser(path)
@@ -19,5 +34,5 @@ class AudioFolderDataset(Dataset):
    def __getitem__(self, i):
        file_name = self.file_names[i]
        y, _ = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
        return y
View File
@@ -1,8 +1,23 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
from pathlib import Path

__all__ = ["LJSpeechMetaData"]


class LJSpeechMetaData(Dataset):
    def __init__(self, root):
        self.root = Path(root).expanduser()
@@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset):
    def __len__(self):
        return len(self.records)
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
View File
@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *
View File
@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
View File
@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
View File
@@ -1,8 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def full2half_width(ustr):
    half = []
    for u in ustr:
        num = ord(u)
        if num == 0x3000:  # full-width space to half-width
            num = 32
        elif 0xFF01 <= num <= 0xFF5E:
            num -= 0xfee0
@@ -10,15 +24,16 @@ def full2half_width(ustr):
        half.append(u)
    return ''.join(half)


def half2full_width(ustr):
    full = []
    for u in ustr:
        num = ord(u)
        if num == 32:  # half-width space to full-width
            num = 0x3000
        elif 0x21 <= num <= 0x7E:
            num += 0xfee0

        u = chr(num)  # to unicode
        full.append(u)
    return ''.join(full)
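Both converters shift code points by 0xFEE0 and special-case the space; for example:

```python
assert full2half_width("Ｈｅｌｌｏ！") == "Hello!"
assert half2full_width("Hello!") == "Ｈｅｌｌｏ！"
```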
View File
@@ -17,7 +17,8 @@ from typing import Union
from g2p_en import G2p
from g2pM import G2pM
from parakeet.frontend import Vocab
# discard opencc until we find an easy solution to install it on windows
# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize
@@ -211,7 +212,7 @@ class Chinese(Phonetics):
    """

    def __init__(self):
        # self.opencc_backend = OpenCC('t2s.json')
        self.backend = G2pM()
        self.phonemes = self._get_all_syllables()
        self.punctuations = get_punctuations("cn")
@@ -236,7 +237,8 @@ class Chinese(Phonetics):
        List[str]
            The list of pronunciation sequence.
        """
        # simplified = self.opencc_backend.convert(sentence)
        simplified = sentence
        phonemes = self.backend(simplified)
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
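If traditional-to-simplified conversion is needed again, one way to restore it without breaking Windows installs is an optional import (a sketch of the idea, not what this commit does; it mirrors the `OpenCC('t2s.json')` call that was removed):

```python
try:
    from opencc import OpenCC  # optional dependency
    _t2s = OpenCC('t2s.json').convert
except ImportError:
    _t2s = lambda text: text  # identity fallback when opencc is unavailable

# then, inside Chinese.phoneticize:
#     simplified = _t2s(sentence)
```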
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import string
@@ -13,15 +27,8 @@ EN_PUNCT = [
    "!",
]

CN_PUNCT = ["、", "，", "；", "：", "。", "？", "！"]


def get_punctuations(lang):
    if lang == "en":
@@ -30,4 +37,3 @@ def get_punctuations(lang):
        return CN_PUNCT
    else:
        raise ValueError(f"language {lang} Not supported")
View File
@@ -559,7 +559,7 @@ class TransformerTTS(nn.Layer):
    @classmethod
    def from_pretrained(cls, frontend, config, checkpoint_path):
        model = TransformerTTS(
            frontend,
            d_encoder=config.model.d_encoder,
            d_decoder=config.model.d_decoder,
            d_mel=config.data.d_mel,
@@ -575,11 +575,12 @@ class TransformerTTS(nn.Layer):
            decoder_prenet_dropout=config.model.decoder_prenet_dropout,
            dropout=config.model.dropout)

        iteration = checkpoint.load_parameters(
            model, checkpoint_path=checkpoint_path)
        drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
        reduction_factor = scheduler.StepWise(config.training.reduction_factor)
        model.set_constants(
            reduction_factor=reduction_factor(iteration),
            drop_n_heads=drop_n_heads(iteration))
        return model
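`scheduler.StepWise` maps an iteration number to a scheduled value so that constants like `reduction_factor` resume correctly from a checkpoint. The actual parakeet class is not shown in this diff; a minimal piecewise-constant sketch of the idea, assuming the anchors are (step, value) pairs:

```python
class StepWise:
    """Piecewise-constant schedule: returns the value of the last anchor
    whose step is <= the queried iteration. Sketch only."""

    def __init__(self, anchors):
        # anchors: list of (step, value) sorted by step,
        # e.g. [(0, 0), (250000, 1), (500000, 2)]
        self.anchors = anchors

    def __call__(self, iteration):
        value = self.anchors[0][1]
        for step, v in self.anchors:
            if iteration >= step:
                value = v
        return value


drop_n_heads = StepWise([(0, 0), (250000, 1), (500000, 2)])
assert drop_n_heads(300000) == 1
```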
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
from typing import List, Union, Tuple
@@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo

__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]


def fold(x, n_group):
    r"""Fold audio or spectrogram's temporal dimension into groups.
@@ -31,6 +46,7 @@ def fold(x, n_group):
    new_shape = spatial_shape + [time_steps // n_group, n_group]
    return paddle.reshape(x, new_shape)


class UpsampleNet(nn.LayerList):
    """Layer to upsample mel spectrogram to the same temporal resolution with
    the corresponding waveform.
@@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList):
    ---------
    ``librosa.core.stft``
    """

    def __init__(self, upsample_factors):
        super(UpsampleNet, self).__init__()
        for factor in upsample_factors:
@@ -67,16 +84,18 @@ class UpsampleNet(nn.LayerList):
            init = I.Uniform(-std, std)
            self.append(
                nn.utils.weight_norm(
                    nn.Conv2DTranspose(
                        1,
                        1, (3, 2 * factor),
                        padding=(1, factor // 2),
                        stride=(1, factor),
                        weight_attr=init,
                        bias_attr=init)))

        # upsample factors
        self.upsample_factor = np.prod(upsample_factors)
        self.upsample_factors = upsample_factors

    def forward(self, x, trim_conv_artifact=False):
        r"""Forward pass of the ``UpsampleNet``.
@@ -131,38 +150,47 @@ class ResidualBlock(nn.Layer):
    dilations : int
        Dilations of the Convolution2d applied to the input.
    """

    def __init__(self, channels, cond_channels, kernel_size, dilations):
        super(ResidualBlock, self).__init__()
        # input conv
        std = math.sqrt(1 / channels * np.prod(kernel_size))
        init = I.Uniform(-std, std)
        receptive_field = [
            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
        ]
        rh, rw = receptive_field
        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # causal & same
        conv = nn.Conv2D(
            channels,
            2 * channels,
            kernel_size,
            padding=paddings,
            dilation=dilations,
            weight_attr=init,
            bias_attr=init)
        self.conv = nn.utils.weight_norm(conv)
        self.rh = rh
        self.rw = rw
        self.dilations = dilations

        # condition projection
        std = math.sqrt(1 / cond_channels)
        init = I.Uniform(-std, std)
        condition_proj = nn.Conv2D(
            cond_channels,
            2 * channels, (1, 1),
            weight_attr=init,
            bias_attr=init)
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        # parametric residual & skip connection
        std = math.sqrt(1 / channels)
        init = I.Uniform(-std, std)
        out_proj = nn.Conv2D(
            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
        self.out_proj = nn.utils.weight_norm(out_proj)

    def forward(self, x, condition):
        """Compute output for a whole folded sequence.
@@ -185,10 +213,10 @@ class ResidualBlock(nn.Layer):
        x_in = x
        x = self.conv(x)
        x += self.condition_proj(condition)

        content, gate = paddle.chunk(x, 2, axis=1)
        x = paddle.tanh(content) * F.sigmoid(gate)

        x = self.out_proj(x)
        res, skip = paddle.chunk(x, 2, axis=1)
        res = x_in + res
@@ -249,7 +277,7 @@ class ResidualBlock(nn.Layer):
        content, gate = paddle.chunk(x_row, 2, axis=1)
        x_row = paddle.tanh(content) * F.sigmoid(gate)

        x_row = self.out_proj(x_row)
        res, skip = paddle.chunk(x_row, 2, axis=1)
        res = x_row_in + res
@@ -290,20 +318,23 @@ class ResidualNet(nn.LayerList):
    ValueError
        If the length of dilations_h does not equals n_layers.
    """

    def __init__(self,
                 n_layer: int,
                 residual_channels: int,
                 condition_channels: int,
                 kernel_size: Tuple[int],
                 dilations_h: List[int]):
        if len(dilations_h) != n_layer:
            raise ValueError(
                "number of dilations_h should equals num of layers")

        super(ResidualNet, self).__init__()
        for i in range(n_layer):
            dilation = (dilations_h[i], 2**i)
            layer = ResidualBlock(residual_channels, condition_channels,
                                  kernel_size, dilation)
            self.append(layer)

    def forward(self, x, condition):
        """Compute the output given the input and the condition.
@@ -332,7 +363,7 @@ class ResidualNet(nn.LayerList):
        """
        for layer in self:
            layer.start_sequence()

    def add_input(self, x_row, condition_row):
        """Compute the output for a row and update the buffers.
@@ -386,33 +417,37 @@ class Flow(nn.Layer):
        Number of timesteps to be folded into a group.
    """
    dilations_dict = {
        8: [1, 1, 1, 1, 1, 1, 1, 1],
        16: [1, 1, 1, 1, 1, 1, 1, 1],
        32: [1, 2, 4, 1, 2, 4, 1, 2],
        64: [1, 2, 4, 8, 16, 1, 2, 4],
        128: [1, 2, 4, 8, 16, 32, 64, 1]
    }

    def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group):
        super(Flow, self).__init__()
        # input projection
        self.input_proj = nn.utils.weight_norm(
            nn.Conv2D(
                1,
                channels, (1, 1),
                weight_attr=I.Uniform(-1., 1.),
                bias_attr=I.Uniform(-1., 1.)))

        # residual net
        self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
                                  self.dilations_dict[n_group])

        # output projection
        self.output_proj = nn.Conv2D(
            channels,
            2, (1, 1),
            weight_attr=I.Constant(0.),
            bias_attr=I.Constant(0.))

        # specs
        self.n_group = n_group

    def _predict_parameters(self, x, condition):
        x = self.input_proj(x)
        x = self.resnet(x, condition)
@@ -421,11 +456,11 @@ class Flow(nn.Layer):
        return logs, b

    def _transform(self, x, logs, b):
        z_0 = x[:, :, :1, :]  # the first row, just copy it
        z_out = x[:, :, 1:, :] * paddle.exp(logs) + b
        z_out = paddle.concat([z_0, z_out], axis=2)
        return z_out

    def forward(self, x, condition):
        """Probability density estimation. It is done by inversely transform
        a sample from p(X) into a sample from p(Z).
@@ -452,8 +487,8 @@ class Flow(nn.Layer):
            transformation from x to z.
        """
        # (B, C, H-1, W)
        logs, b = self._predict_parameters(x[:, :, :-1, :],
                                           condition[:, :, 1:, :])
        z = self._transform(x, logs, b)
        return z, (logs, b)
@@ -467,7 +502,7 @@ class Flow(nn.Layer):
    def _inverse_transform_row(self, z_row, logs, b):
        x_row = (z_row - b) * paddle.exp(-logs)
        return x_row

    def _inverse_row(self, z_row, x_row, condition_row):
        logs, b = self._predict_row_parameters(x_row, condition_row)
        x_next_row = self._inverse_transform_row(z_row, logs, b)
@@ -475,7 +510,7 @@ class Flow(nn.Layer):
    def _start_sequence(self):
        self.resnet.start_sequence()

    def inverse(self, z, condition):
        """Sampling from the distribution p(X). It is done by sampling from
        p(Z) and transforming the sample. It is an autoregressive transformation.
@@ -510,15 +545,16 @@ class Flow(nn.Layer):
        self._start_sequence()
        for i in range(1, self.n_group):
            x_row = x[-1]  # actually i-1:i
            z_row = z[:, :, i:i + 1, :]
            condition_row = condition[:, :, i:i + 1, :]

            x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
                                                      condition_row)
            x.append(x_next_row)
            logs_list.append(logs)
            b_list.append(b)

        x = paddle.concat(x, 2)
        logs = paddle.concat(logs_list, 2)
        b = paddle.concat(b_list, 2)
@@ -549,21 +585,25 @@ class WaveFlow(nn.LayerList):
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """

    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
                 kernel_size):
        if n_group % 2 or n_flows % 2:
            raise ValueError(
                "number of flows and number of group must be even "
                "since a permutation along group among flows is used.")

        super(WaveFlow, self).__init__()
        for _ in range(n_flows):
            self.append(
                Flow(n_layers, channels, mel_bands, kernel_size, n_group))

        # permutations in h
        self.perms = self._create_perm(n_group, n_flows)

        # specs
        self.n_group = n_group
        self.n_flows = n_flows

    def _create_perm(self, n_group, n_flows):
        indices = list(range(n_group))
        half = n_group // 2
@@ -572,20 +612,21 @@ class WaveFlow(nn.LayerList):
            if i < n_flows // 2:
                perms.append(indices[::-1])
            else:
                perm = list(reversed(indices[:half])) + list(
                    reversed(indices[half:]))
                perms.append(perm)
        return perms

    def _trim(self, x, condition):
        assert condition.shape[-1] >= x.shape[-1]

        pruned_len = int(x.shape[-1] // self.n_group * self.n_group)

        if x.shape[-1] > pruned_len:
            x = x[:, :pruned_len]
        if condition.shape[-1] > pruned_len:
            condition = condition[:, :, :pruned_len]
        return x, condition

    def forward(self, x, condition):
        """Probability density estimation of random variable x given the
        condition.
@@ -610,21 +651,23 @@ class WaveFlow(nn.LayerList):
        # x: (B, T)
        # condition: (B, C, T) upsampled condition
        x, condition = self._trim(x, condition)

        # to (B, C, h, T//h) layout
        x = paddle.unsqueeze(
            paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
        condition = paddle.transpose(
            fold(condition, self.n_group), [0, 1, 3, 2])

        # flows
        logs_list = []
        for i, layer in enumerate(self):
            x, (logs, b) = layer(x, condition)
            logs_list.append(logs)
            # permute paddle has no shuffle dim
            x = geo.shuffle_dim(x, 2, perm=self.perms[i])
            condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])

        z = paddle.squeeze(x, 1)  # (B, H, W)
        batch_size = z.shape[0]
        z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1])
@@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList):
        z, condition = self._trim(z, condition)
        # to (B, C, h, T//h) layout
        z = paddle.unsqueeze(
            paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
        condition = paddle.transpose(
            fold(condition, self.n_group), [0, 1, 3, 2])

        # reverse it flow by flow
        for i in reversed(range(self.n_flows)):
@@ -663,7 +708,7 @@ class WaveFlow(nn.LayerList):
            condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
            z, (logs, b) = self[i].inverse(z, condition)

        x = paddle.squeeze(z, 1)  # (B, H, W)
        batch_size = x.shape[0]
        x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1])
        return x
@@ -695,23 +740,24 @@ class ConditionalWaveFlow(nn.LayerList):
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_flows: int,
                 n_layers: int,
                 n_group: int,
                 channels: int,
                 n_mels: int,
                 kernel_size: Union[int, List[int]]):
        super(ConditionalWaveFlow, self).__init__()
        self.encoder = UpsampleNet(upsample_factors)
        self.decoder = WaveFlow(
            n_flows=n_flows,
            n_layers=n_layers,
            n_group=n_group,
            channels=channels,
            mel_bands=n_mels,
            kernel_size=kernel_size)

    def forward(self, audio, mel):
        """Compute the transformed random variable z (x to z) and the log of
@@ -737,7 +783,7 @@ class ConditionalWaveFlow(nn.LayerList):
        condition = self.encoder(mel)
        z, log_det_jacobian = self.decoder(audio, condition)
        return z, log_det_jacobian

    @paddle.no_grad()
    def infer(self, mel):
        r"""Generate raw audio given mel spectrogram.
@@ -752,12 +798,12 @@ class ConditionalWaveFlow(nn.LayerList):
        Tensor : [shape=(B, T)]
            The synthesized audio, where ``T <= T_mel \* upsample_factors``.
        """
        condition = self.encoder(mel, trim_conv_artifact=True)  # (B, C, T)
        batch_size, _, time_steps = condition.shape
        z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
        x = self.decoder.inverse(z, condition)
        return x

    @paddle.no_grad()
    def predict(self, mel):
        """Generate raw audio given mel spectrogram.
@@ -777,7 +823,7 @@ class ConditionalWaveFlow(nn.LayerList):
        audio = self.infer(mel)
        audio = audio[0].numpy()
        return audio

    @classmethod
    def from_pretrained(cls, config, checkpoint_path):
        """Build a ConditionalWaveFlow model from a pretrained model.
@@ -795,14 +841,13 @@ class ConditionalWaveFlow(nn.LayerList):
        ConditionalWaveFlow
            The model built from pretrained result.
        """
        model = cls(upsample_factors=config.model.upsample_factors,
                    n_flows=config.model.n_flows,
                    n_layers=config.model.n_layers,
                    n_group=config.model.n_group,
                    channels=config.model.channels,
                    n_mels=config.data.n_mels,
                    kernel_size=config.model.kernel_size)
        checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
        return model
@@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer):
        The standard deviation of the gaussian noise used in WaveFlow, by
        default 1.0.
    """

    def __init__(self, sigma=1.0):
        super(WaveFlowLoss, self).__init__()
        self.sigma = sigma
@@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer):
        Tensor [shape=(1,)]
            The loss.
        """
        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
                                    ) - log_det_jacobian
        loss = loss / np.prod(z.shape)
        return loss + self.const
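`fold` is a pure reshape, which is why `_trim` must first make the time dimension divisible by `n_group`. A numpy equivalent as a sketch, showing that unfolding is the exact inverse:

```python
import numpy as np

B, T, n_group = 2, 27, 8
x = np.arange(B * T).reshape(B, T)

# _trim: drop the remainder so T is a multiple of n_group
pruned = T // n_group * n_group  # 24
x = x[:, :pruned]

folded = x.reshape(B, pruned // n_group, n_group)  # (2, 3, 8)
assert (folded.reshape(B, pruned) == x).all()      # unfold inverts fold
```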
View File
@@ -18,7 +18,7 @@ from typing import Union, Sequence, List
from tqdm import trange
import numpy as np

import paddle
from paddle import nn
from paddle.nn import functional as F
import paddle.fluid.initializer as I
@@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools

__all__ = ["WaveNet", "ConditionalWaveNet"]


def crop(x, audio_start, audio_length):
    """Crop the upsampled condition to match audio_length.
@@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList):
    ---------
    ``librosa.core.stft``
    """

    def __init__(self, upscale_factors=[16, 16]):
        super(UpsampleNet, self).__init__()
        self.upscale_factors = list(upscale_factors)
@@ -106,9 +108,11 @@ class UpsampleNet(nn.LayerList):
        for factor in self.upscale_factors:
            self.append(
                nn.utils.weight_norm(
                    nn.Conv2DTranspose(
                        1,
                        1,
                        kernel_size=(3, 2 * factor),
                        stride=(1, factor),
                        padding=(1, factor // 2))))

    def forward(self, x):
@@ -159,29 +163,34 @@ class ResidualBlock(nn.Layer):
    dilation : int
        Dilation of the internal convolution cells.
    """

    def __init__(self,
                 residual_channels: int,
                 condition_dim: int,
                 filter_size: Union[int, Sequence[int]],
                 dilation: int):
        super(ResidualBlock, self).__init__()
        dilated_channels = 2 * residual_channels
        # following clarinet's implementation, we do not have parametric residual
        # & skip connection.

        _filter_size = filter_size[0] if isinstance(filter_size, (
            list, tuple)) else filter_size
        std = math.sqrt(1 / (_filter_size * residual_channels))
        conv = Conv1dCell(
            residual_channels,
            dilated_channels,
            filter_size,
            dilation=dilation,
            weight_attr=I.Normal(scale=std))
        self.conv = nn.utils.weight_norm(conv)

        std = math.sqrt(1 / condition_dim)
        condition_proj = Conv1dCell(
            condition_dim,
            dilated_channels, (1, ),
            weight_attr=I.Normal(scale=std))
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        self.filter_size = filter_size
@@ -309,10 +318,11 @@ class ResidualNet(nn.LayerList):
        Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
    """

    def __init__(self,
                 n_stack: int,
                 n_loop: int,
                 residual_channels: int,
                 condition_dim: int,
                 filter_size: int):
        super(ResidualNet, self).__init__()
@@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList):
        dilations = [2**i for i in range(n_loop)] * n_stack
        self.context_size = 1 + sum(dilations)
        for dilation in dilations:
            self.append(
                ResidualBlock(residual_channels, condition_dim, filter_size,
                              dilation))

    def forward(self, x, condition=None):
        """Forward pass of ``ResidualNet``.
@@ -345,7 +357,7 @@ class ResidualNet(nn.LayerList):
                skip_connections = skip
            else:
                skip_connections = paddle.scale(skip_connections + skip,
                                                math.sqrt(0.5))
        return skip_connections

    def start_sequence(self):
@@ -381,7 +393,7 @@ class ResidualNet(nn.LayerList):
                skip_connections = skip
            else:
                skip_connections = paddle.scale(skip_connections + skip,
                                                math.sqrt(0.5))
        return skip_connections
@@ -426,6 +438,7 @@ class WaveNet(nn.Layer):
        This is only used for computing loss when ``loss_type`` is "mog". If
        the predicted log scale is less than -9.0, it is clipped at -9.0.
    """

    def __init__(self, n_stack, n_loop, residual_channels, output_dim,
                 condition_dim, filter_size, loss_type, log_scale_min):
@@ -437,19 +450,24 @@ class WaveNet(nn.Layer):
        else:
            if (output_dim % 3 != 0):
                raise ValueError(
                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".
                    format(output_dim))
        self.embed = nn.utils.weight_norm(
            nn.Linear(1, residual_channels), dim=1)

        self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
                                  condition_dim, filter_size)
        self.context_size = self.resnet.context_size

        skip_channels = residual_channels  # assume the same channel
        self.proj1 = nn.utils.weight_norm(
            nn.Linear(skip_channels, skip_channels), dim=1)
        self.proj2 = nn.utils.weight_norm(
            nn.Linear(skip_channels, skip_channels), dim=1)
        # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
        # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
        self.proj3 = nn.utils.weight_norm(
            nn.Linear(skip_channels, output_dim), dim=1)

        self.loss_type = loss_type
self.output_dim = output_dim self.output_dim = output_dim
@ -781,26 +799,28 @@ class ConditionalWaveNet(nn.Layer):
This is only used for computing loss when ``loss_type`` is "mog", If This is only used for computing loss when ``loss_type`` is "mog", If
the predicted log scale is less than -9.0, it is clipped at -9.0. the predicted log scale is less than -9.0, it is clipped at -9.0.
""" """
def __init__(self,
upsample_factors: List[int], def __init__(self,
n_stack: int, upsample_factors: List[int],
n_loop: int, n_stack: int,
residual_channels: int, n_loop: int,
residual_channels: int,
output_dim: int, output_dim: int,
n_mels: int, n_mels: int,
filter_size: int=2, filter_size: int=2,
loss_type: str="mog", loss_type: str="mog",
log_scale_min: float=-9.0): log_scale_min: float=-9.0):
super(ConditionalWaveNet, self).__init__() super(ConditionalWaveNet, self).__init__()
self.encoder = UpsampleNet(upsample_factors) self.encoder = UpsampleNet(upsample_factors)
self.decoder = WaveNet(n_stack=n_stack, self.decoder = WaveNet(
n_loop=n_loop, n_stack=n_stack,
residual_channels=residual_channels, n_loop=n_loop,
output_dim=output_dim, residual_channels=residual_channels,
condition_dim=n_mels, output_dim=output_dim,
filter_size=filter_size, condition_dim=n_mels,
loss_type=loss_type, filter_size=filter_size,
log_scale_min=log_scale_min) loss_type=loss_type,
log_scale_min=log_scale_min)
def forward(self, audio, mel, audio_start): def forward(self, audio, mel, audio_start):
"""Compute the output distribution given the mel spectrogram and the input(for teacher force training). """Compute the output distribution given the mel spectrogram and the input(for teacher force training).
@ -895,11 +915,11 @@ class ConditionalWaveNet(nn.Layer):
self.decoder.start_sequence() self.decoder.start_sequence()
x_t = paddle.zeros((batch_size, ), dtype=mel.dtype) x_t = paddle.zeros((batch_size, ), dtype=mel.dtype)
for i in trange(time_steps): for i in trange(time_steps):
c_t = condition[:, :, i] # (B, C) c_t = condition[:, :, i] # (B, C)
y_t = self.decoder.add_input(x_t, c_t) #(B, C) y_t = self.decoder.add_input(x_t, c_t) #(B, C)
y_t = paddle.unsqueeze(y_t, 1) y_t = paddle.unsqueeze(y_t, 1)
x_t = self.sample(y_t) # (B, 1) x_t = self.sample(y_t) # (B, 1)
x_t = paddle.squeeze(x_t, 1) #(B,) x_t = paddle.squeeze(x_t, 1) #(B,)
samples.append(x_t) samples.append(x_t)
samples = paddle.stack(samples, -1) samples = paddle.stack(samples, -1)
return samples return samples
@ -943,16 +963,15 @@ class ConditionalWaveNet(nn.Layer):
ConditionalWaveNet ConditionalWaveNet
The model built from pretrained result. The model built from pretrained result.
""" """
model = cls( model = cls(upsample_factors=config.model.upsample_factors,
upsample_factors=config.model.upsample_factors, n_stack=config.model.n_stack,
n_stack=config.model.n_stack, n_loop=config.model.n_loop,
n_loop=config.model.n_loop, residual_channels=config.model.residual_channels,
residual_channels=config.model.residual_channels, output_dim=config.model.output_dim,
output_dim=config.model.output_dim, n_mels=config.data.n_mels,
n_mels=config.data.n_mels, filter_size=config.model.filter_size,
filter_size=config.model.filter_size, loss_type=config.model.loss_type,
loss_type=config.model.loss_type, log_scale_min=config.model.log_scale_min)
log_scale_min=config.model.log_scale_min)
layer_tools.summary(model) layer_tools.summary(model)
checkpoint.load_parameters(model, checkpoint_path=checkpoint_path) checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
return model return model
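
The hyperparameters wired together above are coupled. A minimal sketch of the arithmetic (the concrete values here are hypothetical, not taken from this diff): the product of ``upsample_factors`` must equal the STFT hop length so that one condition frame is produced per audio sample, and the receptive field follows from ``n_stack`` and ``n_loop`` exactly as in ``ResidualNet.context_size``.

import numpy as np

upsample_factors = [16, 16]      # hypothetical; 16 * 16 = 256 = hop_length
hop_length = int(np.prod(upsample_factors))

n_stack, n_loop = 8, 10          # hypothetical settings
dilations = [2**i for i in range(n_loop)] * n_stack
context_size = 1 + sum(dilations)  # same formula as ResidualNet.context_size

print(hop_length)    # 256
print(context_size)  # 1 + 8 * (2**10 - 1) = 8185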


@ -1,8 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F
from scipy import signal
import numpy as np

__all__ = ["quantize", "dequantize", "STFT"]
@ -86,6 +100,7 @@ class STFT(nn.Layer):
    Only ``center`` and ``reflect`` padding is supported now.
    """

    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
        super(STFT, self).__init__()
        self.hop_length = hop_length

@ -109,7 +124,8 @@ class STFT(nn.Layer):

                            (self.n_bin, 1, 1, self.n_fft))
        w = np.concatenate([w_real, w_imag], axis=0)
        self.weight = paddle.cast(
            paddle.to_tensor(w), paddle.get_default_dtype())

    def forward(self, x):
        """Compute the stft transform.


@ -20,6 +20,7 @@ __all__ = [
"Conv1dBatchNorm", "Conv1dBatchNorm",
] ]
class Conv1dCell(nn.Conv1D): class Conv1dCell(nn.Conv1D):
"""A subclass of Conv1D layer, which can be used in an autoregressive """A subclass of Conv1D layer, which can be used in an autoregressive
decoder like an RNN cell. decoder like an RNN cell.
@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer):
epsilon : [type], optional epsilon : [type], optional
The epsilon of the BatchNorm1D layer, by default 1e-05 The epsilon of the BatchNorm1D layer, by default 1e-05
""" """
def __init__(self, def __init__(self,
in_channels, in_channels,
out_channels, out_channels,
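
The autoregressive use of ``Conv1dCell`` looks like the following sketch, which mirrors the (now removed) unit test further down in this diff: after ``start_sequence()``, ``add_input()`` consumes one time step at a time and should reproduce the parallel forward pass.

import paddle
from parakeet.modules import conv

net = conv.Conv1dCell(4, 6, 5, dilation=2)
net.eval()
x = paddle.randn([2, 4, 16])  # (batch, channels, time)
y_parallel = net(x)

net.start_sequence()
outs = []
with paddle.no_grad():
    for i in range(x.shape[-1]):
        outs.append(net.add_input(x[:, :, i]))  # one step: (batch, channels)
y_incremental = paddle.stack(outs, axis=-1)  # should match y_parallel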


@ -1,6 +1,21 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle


def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along axis given the permutation or randomly.

@ -32,7 +47,7 @@ def shuffle_dim(x, axis, perm=None):

        perm = np.array(perm)
    else:
        perm = np.random.permutation(size)

    perm = paddle.to_tensor(perm)
    out = paddle.gather(x, perm, axis)
    return out
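
Usage sketch (mirroring the removed test_geometry.py below): with an explicit permutation the chosen axis is reordered accordingly; without one, a random permutation is drawn.

import paddle
from parakeet.modules import geometry as geo

x = paddle.randn([2, 3, 4, 6])
y = geo.shuffle_dim(x, 2, perm=[3, 2, 1, 0])  # axis 2 reversed
z = geo.shuffle_dim(x, 2)                     # random permutation of axis 2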


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numba
import numpy as np
import paddle

@ -5,12 +19,13 @@ from paddle import nn

from paddle.nn import functional as F

__all__ = [
    "weighted_mean",
    "masked_l1_loss",
    "masked_softmax_with_cross_entropy",
    "diagonal_loss",
]


def weighted_mean(input, weight):
    """Weighted mean. It can also be used as masked mean.

@ -88,12 +103,11 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):

    return loss


def diagonal_loss(attentions,
                  input_lengths,
                  target_lengths,
                  g=0.2,
                  multihead=False):
    """A metric to evaluate how diagonal an attention distribution is.

    It is computed for batch attention distributions. For each attention

@ -133,6 +147,7 @@ def diagonal_loss(

    else:
        return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))


@numba.jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    W = np.zeros((max_T, max_N), dtype=np.float32)

@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g):

    # (T_dec, T_enc)
    return W


def guided_attentions(input_lengths, target_lengths, g=0.2):
    B = len(input_lengths)
    max_input_len = input_lengths.max()

@ -151,4 +167,4 @@ def guided_attentions(input_lengths, target_lengths, g=0.2):

        W[b] = guided_attention(input_lengths[b], max_input_len,
                                target_lengths[b], max_target_len, g)
    # (B, T_dec, T_enc)
    return W
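
The body of ``guided_attention`` is elided by the hunk above; a reference rendering under the usual guided-attention formulation (Tachibana et al., 2017), W[t, n] = 1 - exp(-(n/N - t/T)**2 / (2 * g**2)), is sketched below. Treat it as an illustration of the shape bookkeeping, not the exact code of this file.

import numpy as np

def guided_attention_ref(N, max_N, T, max_T, g):
    W = np.zeros((max_T, max_N), dtype=np.float32)
    for t in range(T):
        for n in range(N):
            W[t, n] = 1.0 - np.exp(-(n / N - t / T)**2 / (2 * g**2))
    return W  # (T_dec, T_enc)

W = guided_attention_ref(N=10, max_N=12, T=20, max_T=24, g=0.2)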


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid.layers import sequence_mask

@ -8,6 +22,7 @@ __all__ = [

    "future_mask",
]


def id_mask(input, padding_index=0, dtype="bool"):
    """Generate mask with input ids.


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle

@ -5,6 +19,7 @@ from paddle.nn import functional as F

__all__ = ["positional_encoding"]


def positional_encoding(start_index, length, size, dtype=None):
    r"""Generate standard positional encoding matrix.

@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None):

    dtype = dtype or paddle.get_default_dtype()
    channel = np.arange(0, size, 2)
    index = np.arange(start_index, start_index + length, 1)
    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
    encodings = np.zeros([length, size])
    encodings[:, 0::2] = np.sin(p)
    encodings[:, 1::2] = np.cos(p)
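
Usage sketch: the function returns a (length, size) tensor where even columns hold sin(pos / 10000**(channel / size)) and odd columns the matching cos, as computed above.

import paddle
from parakeet.modules import positional_encoding as pe

encodings = pe.positional_encoding(start_index=0, length=20, size=16)
print(encodings.shape)  # [20, 16]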


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn

@ -12,6 +26,7 @@ __all__ = [

    "TransformerDecoderLayer",
]


class PositionwiseFFN(nn.Layer):
    """A faithful implementation of Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer):

        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)

@ -86,16 +99,17 @@ class TransformerEncoderLayer(nn.Layer):

    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerEncoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, x, mask):
        """Forward pass of TransformerEncoderLayer.

@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer):

        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
            F.dropout(
                x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
            F.dropout(
                x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights

@ -155,19 +167,20 @@ class TransformerDecoderLayer(nn.Layer):

    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerDecoderLayer, self).__init__()

        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, q, k, v, encoder_mask, decoder_mask):
        """Forward pass of TransformerDecoderLayer.

@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer):

        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
        context_vector, self_attn_weights = self.self_mha(q, q, q,
                                                          decoder_mask)
        q = self.layer_norm1(
            F.dropout(
                q + context_vector, self.dropout, training=self.training))

        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
                                                            encoder_mask)
        q = self.layer_norm2(
            F.dropout(
                q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
            F.dropout(
                q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
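
Usage sketch (mirroring the removed test_transformer.py below): the decoder layer attends over encoder outputs of a different length than its queries, and returns the post-LN output plus both attention maps.

import paddle
from parakeet.modules import transformer

net = transformer.TransformerDecoderLayer(64, 8, 128, dropout=0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_mask = paddle.fluid.layers.sequence_mask(
    paddle.to_tensor([24, 18, 20, 22]), dtype=k.dtype)
dec_mask = paddle.fluid.layers.sequence_mask(
    paddle.to_tensor([32, 28, 30, 31]), dtype=q.dtype)
y, self_attn, cross_attn = net(q, k, v, enc_mask, dec_mask)
# y: [4, 32, 64]; self_attn: [4, 8, 32, 32]; cross_attn: [4, 8, 32, 24]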


@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.training.cli import *
from parakeet.training.experiment import *


@ -1,5 +1,20 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def default_argument_parser():
    r"""A simple yet general argument parser for experiments with parakeet.

@ -46,5 +61,5 @@ def default_argument_parser():

    # overwrite extra config and default config
    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
    # yapf: enable

    return parser
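
Usage sketch; ``--opts`` greedily consumes the trailing KEY VALUE pairs (the ``--config`` option is defined in a part of the parser elided by the hunk above):

from parakeet.training.cli import default_argument_parser

parser = default_argument_parser()
args = parser.parse_args(
    ["--config", "conf.yaml", "--opts", "data.batch_size", "32"])
print(args.config, args.opts)  # conf.yaml ['data.batch_size', '32']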


@ -1,12 +1,26 @@
from yacs.config import CfgNode # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
_C = CfgNode(
    dict(
        valid_interval=1000,  # validation
        save_interval=10000,  # checkpoint
        max_iteration=900000,  # max iteration to train
    ))


def get_default_training_config():
    return _C.clone()
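
A sketch of how this config is typically consumed: ``clone()`` keeps the module-level ``_C`` immutable, and yacs' ``merge_from_list`` is the natural sink for the ``--opts`` KEY VALUE pairs from the argument parser above. Self-contained version (the module path of this file is not shown in the diff, so ``_C`` is redefined locally):

from yacs.config import CfgNode

_C = CfgNode(
    dict(
        valid_interval=1000,
        save_interval=10000,
        max_iteration=900000, ))

config = _C.clone()
config.merge_from_list(["max_iteration", 500000])
print(config.max_iteration)  # 500000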


@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"] __all__ = ["ExperimentBase"]
class ExperimentBase(object): class ExperimentBase(object):
""" """
An experiment template in order to structure the training code and take An experiment template in order to structure the training code and take


@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
    return iteration


def _save_checkpoint(checkpoint_dir: str, iteration: int):
    """Save the iteration number of the latest model to be checkpointed.

@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):

    with open(checkpoint_record, "wt") as handle:
        handle.write("model_checkpoint_path: step-{}".format(iteration))


def load_parameters(model,
                    optimizer=None,
                    checkpoint_dir=None,

@ -97,18 +99,19 @@ def load_parameters(model,

    params_path = checkpoint_path + ".pdparams"
    model_dict = paddle.load(params_path)
    model.set_state_dict(model_dict)
    print("[checkpoint] Rank {}: loaded model from {}".format(local_rank,
                                                              params_path))

    optimizer_path = checkpoint_path + ".pdopt"
    if optimizer and os.path.isfile(optimizer_path):
        optimizer_dict = paddle.load(optimizer_path)
        optimizer.set_state_dict(optimizer_dict)
        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
            local_rank, optimizer_path))

    return iteration


@mp_tools.rank_zero_only
def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.

@ -124,7 +127,7 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None):

        None
    """
    checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))

    model_dict = model.state_dict()
    params_path = checkpoint_path + ".pdparams"
    paddle.save(model_dict, params_path)
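
A sketch of the save/load round trip defined above (the directory is illustrative and must exist; ``save_parameters`` is a no-op on non-zero ranks because of the decorator):

import paddle
from parakeet.utils import checkpoint

model = paddle.nn.Linear(4, 4)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())

checkpoint.save_parameters("runs/checkpoints", 1000, model, optimizer)
iteration = checkpoint.load_parameters(
    model, optimizer, checkpoint_dir="runs/checkpoints")
print(iteration)  # 1000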


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.framework import core


@ -28,6 +28,7 @@ def summary(layer: nn.Layer):
print("layer has {} parameters, {} elements.".format(num_params, print("layer has {} parameters, {} elements.".format(num_params,
num_elements)) num_elements))
def gradient_norm(layer: nn.Layer): def gradient_norm(layer: nn.Layer):
grad_norm_dict = {} grad_norm_dict = {}
for name, param in layer.state_dict().items(): for name, param in layer.state_dict().items():
@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer):
grad_norm_dict[name] = np.linalg.norm(grad) / grad.size grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
return grad_norm_dict return grad_norm_dict
def recursively_remove_weight_norm(layer: nn.Layer): def recursively_remove_weight_norm(layer: nn.Layer):
for layer in layer.sublayers(): for layer in layer.sublayers():
try: try:
@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer):
# ther is not weight norm hoom in this layer # ther is not weight norm hoom in this layer
pass pass
def freeze(layer: nn.Layer): def freeze(layer: nn.Layer):
for param in layer.parameters(): for param in layer.parameters():
param.trainable = False param.trainable = False
def unfreeze(layer: nn.Layer): def unfreeze(layer: nn.Layer):
for param in layer.parameters(): for param in layer.parameters():
param.trainable = True param.trainable = True
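
Usage sketch: freezing a module during fine-tuning and unfreezing it afterwards simply toggles ``trainable`` on every parameter.

import paddle
from parakeet.utils import layer_tools

net = paddle.nn.Linear(8, 8)
layer_tools.freeze(net)
assert not any(p.trainable for p in net.parameters())
layer_tools.unfreeze(net)
assert all(p.trainable for p in net.parameters())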


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import distributed as dist
from functools import wraps

@ -11,11 +25,8 @@ def rank_zero_only(func):

    @wraps(func)
    def wrapper(*args, **kwargs):
        if local_rank != 0:
            return
        result = func(*args, **kwargs)
        return result

    return wrapper
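
Usage sketch: in multi-process training only the rank-0 process executes the decorated function; every other rank returns None immediately.

from parakeet.utils import mp_tools

@mp_tools.rank_zero_only
def log(msg):
    print(msg)

log("only printed on rank 0")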


@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

__all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"]

@ -24,7 +38,7 @@ class PieceWise(SchedulerBase):

        self.xs = [item[0] for item in anchors]
        self.ys = [item[1] for item in anchors]
        self.num_anchors = len(self.xs)

    def __call__(self, step):
        i = 0
        for x in self.xs:

@ -34,8 +48,8 @@ class PieceWise(SchedulerBase):

            return self.ys[0]
        if i == self.num_anchors:
            return self.ys[-1]
        k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1])
        out = self.ys[i - 1] + (step - self.xs[i - 1]) * k
        return out

@ -47,7 +61,7 @@ class StepWise(SchedulerBase):

        self.xs = [item[0] for item in anchors]
        self.ys = [item[1] for item in anchors]
        self.num_anchors = len(self.xs)

    def __call__(self, step):
        i = 0
        for x in self.xs:

@ -58,5 +72,4 @@ class StepWise(SchedulerBase):

            return self.ys[-1]
        if i == 0:
            return self.ys[0]
        return self.ys[i - 1]
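
A worked example of the ``PieceWise`` interpolation above, assuming the module lives at ``parakeet.utils.scheduler``: between anchors the value falls linearly with the step, and it is clamped outside the anchor range.

from parakeet.utils.scheduler import PieceWise

sched = PieceWise([(1000, 1e-3), (11000, 1e-4)])
print(sched(0))      # 1e-3   (clamped below the first anchor)
print(sched(6000))   # 5.5e-4 (linear interpolation between anchors)
print(sched(20000))  # 1e-4   (clamped above the last anchor)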


@ -48,7 +48,6 @@ setup_info = dict(
    description='Speech synthesis tools and models based on Paddlepaddle',
    long_description=long_description,
    license='Apache 2',
    python_requires='>=3.6',
    install_requires=[
        'numpy',

@ -64,30 +63,25 @@ setup_info = dict(

        'scipy',
        'pandas',
        'sox',
        # 'opencc',
        'soundfile',
        'g2p_en',
        'g2pM',
        'yacs',
        'tensorboardX',
    ],
    extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },

    # Package info
    packages=find_packages(exclude=('tests', 'tests.*')),
    zip_safe=True,
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: Apache2 License',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ], )

setup(**setup_info)


@ -1,101 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.modules import attention as attn


class TestScaledDotProductAttention(unittest.TestCase):
    def test_without_mask(self):
        x = paddle.randn([4, 16, 8])
        context_vector, attention_weights = attn.scaled_dot_product_attention(
            x, x, x)
        assert list(context_vector.shape) == [4, 16, 8]
        assert list(attention_weights.shape) == [4, 16, 16]

    def test_with_mask(self):
        x = paddle.randn([4, 16, 8])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([16, 15, 13, 14]), dtype=x.dtype)
        mask = mask.unsqueeze(1)  # unsqueeze for the decoder time steps
        context_vector, attention_weights = attn.scaled_dot_product_attention(
            x, x, x, mask)
        assert list(context_vector.shape) == [4, 16, 8]
        assert list(attention_weights.shape) == [4, 16, 16]

    def test_4d(self):
        x = paddle.randn([4, 6, 16, 8])
        context_vector, attention_weights = attn.scaled_dot_product_attention(
            x, x, x)
        assert list(context_vector.shape) == [4, 6, 16, 8]
        assert list(attention_weights.shape) == [4, 6, 16, 16]


class TestMonoheadAttention(unittest.TestCase):
    def test_io(self):
        net = attn.MonoheadAttention(6, 0.1)
        q = paddle.randn([4, 18, 6])
        k = paddle.randn([4, 12, 6])
        v = paddle.randn([4, 12, 6])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
        context_vector, attn_weights = net(q, k, v, mask)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 18, 6))
        self.assertTupleEqual(attn_weights.numpy().shape, (4, 18, 12))


class TestDropHead(unittest.TestCase):
    def test_drop(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 2, training=True)
        # drop 2 heads from 6 at all positions
        np.testing.assert_allclose(np.sum(out.numpy() == 0., axis=1), 2)

    def test_drop_all(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 6, training=True)
        np.testing.assert_allclose(np.sum(out.numpy()), 0)

    def test_eval(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 6, training=False)
        self.assertIs(x, out)


class TestMultiheadAttention(unittest.TestCase):
    def __init__(self, methodName="test_io", same_qk=True):
        super(TestMultiheadAttention, self).__init__(methodName)
        self.same_qk = same_qk

    def setUp(self):
        if self.same_qk:
            net = attn.MultiheadAttention(64, 8, dropout=0.3)
        else:
            net = attn.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
        self.net = net

    def test_io(self):
        q = paddle.randn([4, 12, 64])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
        context_vector, attention_weights = self.net(q, q, q, mask)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))


def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(TestScaledDotProductAttention("test_without_mask"))
    suite.addTest(TestScaledDotProductAttention("test_with_mask"))
    suite.addTest(TestScaledDotProductAttention("test_4d"))
    suite.addTest(TestDropHead("test_drop"))
    suite.addTest(TestDropHead("test_drop_all"))
    suite.addTest(TestDropHead("test_eval"))
    suite.addTest(TestMonoheadAttention("test_io"))
    suite.addTest(TestMultiheadAttention("test_io", same_qk=True))
    suite.addTest(TestMultiheadAttention("test_io", same_qk=False))
    return suite


@ -1,34 +0,0 @@
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.modules import cbhg


class TestHighway(unittest.TestCase):
    def test_io(self):
        net = cbhg.Highway(4)
        x = paddle.randn([2, 12, 4])
        y = net(x)
        self.assertTupleEqual(y.numpy().shape, (2, 12, 4))


class TestCBHG(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(TestCBHG, self).__init__(methodName)

    def test_io(self):
        self.net = cbhg.CBHG(64, 32, 16,
                             projection_channels=[64, 128],
                             num_highways=4, highway_features=128,
                             gru_features=64)
        x = paddle.randn([4, 64, 32])
        y = self.net(x)
        self.assertTupleEqual(y.numpy().shape, (4, 32, 128))


def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(TestHighway("test_io"))
    suite.addTest(TestCBHG("test_io"))
    return suite


@ -1,43 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.models import clarinet
from parakeet.modules import stft


class TestParallelWaveNet(unittest.TestCase):
    def test_io(self):
        net = clarinet.ParallelWaveNet([8, 8, 8], [1, 1, 1], 16, 12, 2)
        x = paddle.randn([4, 6073])
        condition = paddle.randn([4, 12, 6073])
        z, out_mu, out_log_std = net(x, condition)
        self.assertTupleEqual(z.numpy().shape, (4, 6073))
        self.assertTupleEqual(out_mu.numpy().shape, (4, 6073))
        self.assertTupleEqual(out_log_std.numpy().shape, (4, 6073))


class TestClariNet(unittest.TestCase):
    def setUp(self):
        encoder = clarinet.UpsampleNet([2, 2])
        teacher = clarinet.WaveNet(8, 3, 16, 3, 12, 2, "mog", -9.0)
        student = clarinet.ParallelWaveNet([8, 8, 8, 8, 8, 8],
                                           [1, 1, 1, 1, 1, 1], 16, 12, 2)
        stft_module = stft.STFT(16, 4, 8)
        net = clarinet.Clarinet(encoder, teacher, student, stft_module,
                                -6.0, lmd=4)
        print("context size is: ", teacher.context_size)
        self.net = net

    def test_io(self):
        audio = paddle.randn([4, 1366])
        mel = paddle.randn([4, 12, 512])  # 512 * 4 = 2048
        audio_start = paddle.zeros([4], dtype="int64")
        loss = self.net(audio, mel, audio_start, clip_kl=True)
        loss["loss"].numpy()

    def test_synthesis(self):
        mel = paddle.randn([4, 12, 512])  # 64 = 246 / 4
        out = self.net.synthesis(mel)
        self.assertTupleEqual(out.numpy().shape, (4, 2048))


@ -1,33 +0,0 @@
import unittest
import paddle
from paddle import nn
paddle.disable_static(paddle.CPUPlace())
paddle.set_default_dtype("float64")

from parakeet.modules import connections as conn


class TestPreLayerNormWrapper(unittest.TestCase):
    def test_io(self):
        net = nn.Linear(8, 8)
        net = conn.PreLayerNormWrapper(net, 8)
        x = paddle.randn([4, 8])
        y = net(x)
        self.assertTupleEqual(x.numpy().shape, y.numpy().shape)


class TestPostLayerNormWrapper(unittest.TestCase):
    def test_io(self):
        net = nn.Linear(8, 8)
        net = conn.PostLayerNormWrapper(net, 8)
        x = paddle.randn([4, 8])
        y = net(x)
        self.assertTupleEqual(x.numpy().shape, y.numpy().shape)


class TestResidualWrapper(unittest.TestCase):
    def test_io(self):
        net = nn.Linear(8, 8)
        net = conn.ResidualWrapper(net)
        x = paddle.randn([4, 8])
        y = net(x)
        self.assertTupleEqual(x.numpy().shape, y.numpy().shape)


@ -1,67 +0,0 @@
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

import unittest
import numpy as np

from parakeet.modules import conv


class TestConv1dCell(unittest.TestCase):
    def setUp(self):
        self.net = conv.Conv1dCell(4, 6, 5, dilation=2)

    def forward_incremental(self, x):
        outs = []
        self.net.start_sequence()
        with paddle.no_grad():
            for i in range(x.shape[-1]):
                xt = x[:, :, i]
                yt = self.net.add_input(xt)
                outs.append(yt)
            y2 = paddle.stack(outs, axis=-1)
        return y2

    def test_equality(self):
        x = paddle.randn([2, 4, 16])
        y1 = self.net(x)

        self.net.eval()
        y2 = self.forward_incremental(x)
        np.testing.assert_allclose(y2.numpy(), y1.numpy())


class TestConv1dBatchNorm(unittest.TestCase):
    def __init__(self, methodName="runTest", causal=False, channel_last=False):
        super(TestConv1dBatchNorm, self).__init__(methodName)
        self.causal = causal
        self.channel_last = channel_last

    def setUp(self):
        k = 5
        padding = (k - 1, 0) if self.causal else ((k - 1) // 2, k // 2)
        self.net = conv.Conv1dBatchNorm(
            4, 6, (k, ), 1, padding=padding,
            data_format="NLC" if self.channel_last else "NCL")

    def test_input_output(self):
        x = paddle.randn([4, 16, 4]) if self.channel_last else paddle.randn(
            [4, 4, 16])
        out = self.net(x)
        out_np = out.numpy()
        if self.channel_last:
            self.assertTupleEqual(out_np.shape, (4, 16, 6))
        else:
            self.assertTupleEqual(out_np.shape, (4, 6, 16))

    def runTest(self):
        self.test_input_output()


def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(TestConv1dBatchNorm("runTest", True, True))
    suite.addTest(TestConv1dBatchNorm("runTest", False, False))
    suite.addTest(TestConv1dBatchNorm("runTest", True, False))
    suite.addTest(TestConv1dBatchNorm("runTest", False, True))
    suite.addTest(TestConv1dCell("test_equality"))
    return suite


@ -1,122 +0,0 @@
import unittest
import numpy as np
import paddle
from paddle import io
from parakeet import data


class MyDataset(io.Dataset):
    def __init__(self, size):
        self._data = np.random.randn(size, 6)

    def __getitem__(self, i):
        return self._data[i]

    def __len__(self):
        return self._data.shape[0]


class TestTransformDataset(unittest.TestCase):
    def test(self):
        dataset = MyDataset(20)
        dataset = data.TransformDataset(dataset, lambda x: np.abs(x))
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("TransformDataset")
        for batch, in dataloader:
            print(type(batch), batch.dtype, batch.shape)


class TestChainDataset(unittest.TestCase):
    def test(self):
        dataset1 = MyDataset(20)
        dataset2 = MyDataset(40)
        dataset = data.ChainDataset(dataset1, dataset2)
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("ChainDataset")
        for batch, in dataloader:
            print(type(batch), batch.dtype, batch.shape)


class TestTupleDataset(unittest.TestCase):
    def test(self):
        dataset1 = MyDataset(20)
        dataset2 = MyDataset(20)
        dataset = data.TupleDataset(dataset1, dataset2)
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("TupleDataset")
        for field1, field2 in dataloader:
            print(type(field1), field1.dtype, field1.shape)
            print(type(field2), field2.dtype, field2.shape)


class TestDictDataset(unittest.TestCase):
    def test(self):
        dataset1 = MyDataset(20)
        dataset2 = MyDataset(20)
        dataset = data.DictDataset(field1=dataset1, field2=dataset2)

        def collate_fn(examples):
            examples_tuples = []
            for example in examples:
                examples_tuples.append(example.values())
            return paddle.fluid.dataloader.dataloader_iter.default_collate_fn(
                examples_tuples)

        dataloader = io.DataLoader(
            dataset,
            batch_size=4,
            shuffle=True,
            num_workers=1,
            collate_fn=collate_fn)
        print("DictDataset")
        for field1, field2 in dataloader:
            print(type(field1), field1.dtype, field1.shape)
            print(type(field2), field2.dtype, field2.shape)


class TestSliceDataset(unittest.TestCase):
    def test(self):
        dataset = MyDataset(40)
        dataset = data.SliceDataset(dataset, 0, 20)
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("SliceDataset")
        for batch, in dataloader:
            print(type(batch), batch.dtype, batch.shape)


class TestSplit(unittest.TestCase):
    def test(self):
        dataset = MyDataset(40)
        train, valid = data.split(dataset, 10)
        dataloader1 = io.DataLoader(
            train, batch_size=4, shuffle=True, num_workers=1)
        dataloader2 = io.DataLoader(
            valid, batch_size=4, shuffle=True, num_workers=1)
        print("First Dataset")
        for batch, in dataloader1:
            print(type(batch), batch.dtype, batch.shape)
        print("Second Dataset")
        for batch, in dataloader2:
            print(type(batch), batch.dtype, batch.shape)


class TestSubsetDataset(unittest.TestCase):
    def test(self):
        dataset = MyDataset(40)
        indices = np.random.choice(np.arange(40), [20], replace=False).tolist()
        dataset = data.SubsetDataset(dataset, indices)
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("SubsetDataset")
        for batch, in dataloader:
            print(type(batch), batch.dtype, batch.shape)


class TestFilterDataset(unittest.TestCase):
    def test(self):
        dataset = MyDataset(40)
        dataset = data.FilterDataset(dataset, lambda x: np.mean(x) > 0.3)
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("FilterDataset")
        for batch, in dataloader:
            print(type(batch), batch.dtype, batch.shape)


class TestCacheDataset(unittest.TestCase):
    def test(self):
        dataset = MyDataset(40)
        dataset = data.CacheDataset(dataset)
        dataloader = io.DataLoader(
            dataset, batch_size=4, shuffle=True, num_workers=1)
        print("CacheDataset")
        for batch, in dataloader:
            print(type(batch), batch.dtype, batch.shape)


@ -1,107 +0,0 @@
import numpy as np
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.models import deepvoice3 as dv3


class TestConvBlock(unittest.TestCase):
    def test_io_causal(self):
        net = dv3.ConvBlock(6, 5, True, True, 8, 0.9)
        x = paddle.randn([4, 32, 6])
        condition = paddle.randn([4, 8])
        # TODO(chenfeiyu): to report an issue on default data type
        padding = paddle.zeros([4, 4, 6], dtype=x.dtype)
        y = net.forward(x, condition, padding)
        self.assertTupleEqual(y.numpy().shape, (4, 32, 6))

    def test_io_non_causal(self):
        net = dv3.ConvBlock(6, 5, False, True, 8, 0.9)
        x = paddle.randn([4, 32, 6])
        condition = paddle.randn([4, 8])
        y = net.forward(x, condition)
        self.assertTupleEqual(y.numpy().shape, (4, 32, 6))


class TestAffineBlock1(unittest.TestCase):
    def test_io(self):
        net = dv3.AffineBlock1(6, 16, True, 8)
        x = paddle.randn([4, 32, 6])
        condition = paddle.randn([4, 8])
        y = net(x, condition)
        self.assertTupleEqual(y.numpy().shape, (4, 32, 16))


class TestAffineBlock2(unittest.TestCase):
    def test_io(self):
        net = dv3.AffineBlock2(6, 16, True, 8)
        x = paddle.randn([4, 32, 6])
        condition = paddle.randn([4, 8])
        y = net(x, condition)
        self.assertTupleEqual(y.numpy().shape, (4, 32, 16))


class TestEncoder(unittest.TestCase):
    def test_io(self):
        net = dv3.Encoder(5, 8, 16, 5, True, 6)
        x = paddle.randn([4, 32, 8])
        condition = paddle.randn([4, 6])
        keys, values = net(x, condition)
        self.assertTupleEqual(keys.numpy().shape, (4, 32, 8))
        self.assertTupleEqual(values.numpy().shape, (4, 32, 8))


class TestAttentionBlock(unittest.TestCase):
    def test_io(self):
        net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
        q = paddle.randn([4, 32, 6])
        k = paddle.randn([4, 24, 6])
        v = paddle.randn([4, 24, 6])
        lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
        condition = paddle.randn([4, 8])
        context_vector, attention_weight = net(q, k, v, lengths, condition, 0)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
        self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))

    def test_io_with_previous_attn(self):
        net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
        q = paddle.randn([4, 32, 6])
        k = paddle.randn([4, 24, 6])
        v = paddle.randn([4, 24, 6])
        lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
        condition = paddle.randn([4, 8])
        prev_attn_weight = paddle.randn([4, 32, 16])
        context_vector, attention_weight = net(
            q, k, v, lengths, condition, 0,
            force_monotonic=True, prev_coeffs=prev_attn_weight, window=(0, 4))
        self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
        self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))


class TestDecoder(unittest.TestCase):
    def test_io(self):
        net = dv3.Decoder(8, 4, [4, 12], 5, 3, 16, 1.0, 1.45, True, 6)
        x = paddle.randn([4, 32, 8])
        # the prenet's last size should equal k's feature size
        k = paddle.randn([4, 24, 12])
        v = paddle.randn([4, 24, 12])
        lengths = paddle.to_tensor([24, 18, 19, 22])
        condition = paddle.randn([4, 6])
        decoded, hidden, attentions, final_state = net(x, k, v, lengths, 0,
                                                       condition)
        self.assertTupleEqual(decoded.numpy().shape, (4, 32, 4 * 8))
        self.assertTupleEqual(hidden.numpy().shape, (4, 32, 12))
        self.assertEqual(len(attentions), 5)
        self.assertTupleEqual(attentions[0].numpy().shape, (4, 32, 24))
        self.assertEqual(len(final_state), 5)
        self.assertTupleEqual(final_state[0].numpy().shape, (4, 2, 12))


class TestPostNet(unittest.TestCase):
    def test_io(self):
        net = dv3.PostNet(3, 8, 16, 3, 12, 4, True, 6)
        x = paddle.randn([4, 32, 8])
        condition = paddle.randn([4, 6])
        y = net(x, condition)
        self.assertTupleEqual(y.numpy().shape, (4, 32 * 4, 12))


@ -1,19 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.modules import geometry as geo


class TestShuffleDim(unittest.TestCase):
    def test_perm(self):
        x = paddle.randn([2, 3, 4, 6])
        y = geo.shuffle_dim(x, 2, [3, 2, 1, 0])
        np.testing.assert_allclose(x.numpy()[0, 0, :, 0],
                                   y.numpy()[0, 0, ::-1, 0])

    def test_random_perm(self):
        x = paddle.randn([2, 3, 4, 6])
        y = geo.shuffle_dim(x, 2)
        np.testing.assert_allclose(x.numpy().sum(2), y.numpy().sum(2))


@ -1,33 +0,0 @@
import unittest
import paddle
paddle.set_device("cpu")
import numpy as np

from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy


class TestWeightedMean(unittest.TestCase):
    def test(self):
        x = paddle.arange(
            0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
        mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
        loss = weighted_mean(x, mask)
        self.assertAlmostEqual(loss.numpy()[0], 7)


class TestMaskedL1Loss(unittest.TestCase):
    def test(self):
        x = paddle.arange(
            0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
        y = paddle.zeros_like(x)
        mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
        loss = masked_l1_loss(x, y, mask)
        print(loss)
        self.assertAlmostEqual(loss.numpy()[0], 7)


class TestMaskedCrossEntropy(unittest.TestCase):
    def test(self):
        x = paddle.randn([3, 30, 8], dtype="float64")
        y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1)  # mind this
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
        loss = masked_softmax_with_cross_entropy(x, y, mask)
        print(loss)


@ -1,54 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")

from parakeet.modules import masking


def sequence_mask(lengths, max_length=None, dtype="bool"):
    max_length = max_length or np.max(lengths)
    ids = np.arange(max_length)
    return (ids < np.expand_dims(lengths, -1)).astype(dtype)


def future_mask(lengths, max_length=None, dtype="bool"):
    max_length = max_length or np.max(lengths)
    return np.tril(np.tril(np.ones(max_length))).astype(dtype)


class TestIDMask(unittest.TestCase):
    def test(self):
        ids = paddle.to_tensor(
            [[1, 2, 3, 0, 0, 0],
             [2, 4, 5, 6, 0, 0],
             [7, 8, 9, 0, 0, 0]])
        mask = masking.id_mask(ids)
        self.assertTupleEqual(mask.numpy().shape, ids.numpy().shape)
        print(mask.numpy())


class TestFeatureMask(unittest.TestCase):
    def test(self):
        features = np.random.randn(3, 16, 8)
        lengths = [16, 14, 12]
        for i, length in enumerate(lengths):
            features[i, length:, :] = 0

        feature_tensor = paddle.to_tensor(features)
        mask = masking.feature_mask(feature_tensor, -1)
        self.assertTupleEqual(mask.numpy().shape, (3, 16, 1))
        print(mask.numpy().squeeze())


class TestCombineMask(unittest.TestCase):
    def test_bool_mask(self):
        lengths = np.array([12, 8, 9, 10])
        padding_mask = sequence_mask(lengths, dtype="bool")
        no_future_mask = future_mask(lengths, dtype="bool")
        combined_mask1 = np.expand_dims(padding_mask, 1) * no_future_mask

        print(paddle.to_tensor(padding_mask).dtype)
        print(paddle.to_tensor(no_future_mask).dtype)
        combined_mask2 = masking.combine_mask(
            paddle.to_tensor(padding_mask).unsqueeze(1),
            paddle.to_tensor(no_future_mask))
        np.testing.assert_allclose(combined_mask2.numpy(), combined_mask1)


@ -1,64 +0,0 @@
import unittest
import numpy as np
import paddle
from parakeet.modules import positional_encoding as pe


def positional_encoding(start_index, length, size, dtype="float32"):
    if size % 2 != 0:
        raise ValueError("size should be divisible by 2")
    channel = np.arange(0, size, 2, dtype=dtype)
    index = np.arange(start_index, start_index + length, 1, dtype=dtype)
    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
    encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
    return encodings


def scalable_positional_encoding(start_index, length, size, omega):
    dtype = omega.dtype
    index = np.arange(start_index, start_index + length, 1, dtype=dtype)
    channel = np.arange(0, size, 2, dtype=dtype)

    p = np.reshape(omega, omega.shape + (1, 1)) \
        * np.expand_dims(index, -1) \
        / (10000**(channel / float(size)))

    encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
    return encodings


class TestPositionEncoding(unittest.TestCase):
    def __init__(self, start=0, length=20, size=16, dtype="float64"):
        super(TestPositionEncoding, self).__init__("runTest")
        self.spec = (start, length, size, dtype)

    def test_equality(self):
        start, length, size, dtype = self.spec
        position_embed1 = positional_encoding(start, length, size, dtype)
        position_embed2 = pe.positional_encoding(start, length, size, dtype)
        np.testing.assert_allclose(position_embed2.numpy(), position_embed1)

    def runTest(self):
        paddle.disable_static(paddle.CPUPlace())
        self.test_equality()


class TestScalablePositionEncoding(unittest.TestCase):
    def __init__(self, start=0, length=20, size=16, dtype="float64"):
        super(TestScalablePositionEncoding, self).__init__("runTest")
        self.spec = (start, length, size, dtype)

    def test_equality(self):
        start, length, size, dtype = self.spec
        omega = np.random.uniform(1, 2, size=(4, )).astype(dtype)
        position_embed1 = scalable_positional_encoding(start, length, size,
                                                       omega)
        position_embed2 = pe.scalable_positional_encoding(
            start, length, size, paddle.to_tensor(omega))
        np.testing.assert_allclose(position_embed2.numpy(), position_embed1)

    def runTest(self):
        paddle.disable_static(paddle.CPUPlace())
        self.test_equality()


def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(TestPositionEncoding(0, 20, 16, "float64"))
    suite.addTest(TestScalablePositionEncoding(0, 20, 16))
    return suite

View File

@ -1,27 +0,0 @@
import unittest
import numpy as np
import librosa
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import stft
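# The STFT layer should reproduce librosa's magnitude spectrogram for the
# same n_fft / hop_length / win_length settings.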
class TestSTFT(unittest.TestCase):
def test(self):
path = librosa.util.example("choice")
wav, sr = librosa.load(path, duration=5)
wav = wav.astype("float64")
spec = librosa.stft(wav, n_fft=2048, hop_length=256, win_length=1024)
mag1 = np.abs(spec)
wav_in_batch = paddle.unsqueeze(paddle.to_tensor(wav), 0)
mag2 = stft.STFT(2048, 256, 1024).magnitude(wav_in_batch)
mag2 = paddle.squeeze(mag2, [0, 2]).numpy()
print("mag1", mag1)
print("mag2", mag2)
        # TODO(chenfeiyu): is there something wrong? Some elements do not
        # match, so the assertion below is disabled for now.
        # np.testing.assert_allclose(mag2, mag1)
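        # Once the mismatch is understood, a tolerance-based comparison
        # might be a better fit, e.g. (hypothetical tolerances):
        # np.testing.assert_allclose(mag2, mag1, rtol=1e-5, atol=1e-6)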

View File

@ -1,43 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import transformer
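# Shape-only smoke tests for the generic transformer building blocks.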
class TestPositionwiseFFN(unittest.TestCase):
def test_io(self):
net = transformer.PositionwiseFFN(8, 12)
x = paddle.randn([2, 3, 4, 8])
y = net(x)
self.assertTupleEqual(y.numpy().shape, (2, 3, 4, 8))
class TestTransformerEncoderLayer(unittest.TestCase):
def test_io(self):
net = transformer.TransformerEncoderLayer(64, 8, 128, 0.5)
x = paddle.randn([4, 12, 64])
lengths = paddle.to_tensor([12, 8, 9, 10])
mask = paddle.fluid.layers.sequence_mask(lengths, dtype=x.dtype)
y, attn_weights = net(x, mask)
self.assertTupleEqual(y.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attn_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerDecoderLayer(unittest.TestCase):
def test_io(self):
net = transformer.TransformerDecoderLayer(64, 8, 128, 0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_lengths = paddle.to_tensor([24, 18, 20, 22])
dec_lengths = paddle.to_tensor([32, 28, 30, 31])
enc_mask = paddle.fluid.layers.sequence_mask(enc_lengths, dtype=k.dtype)
dec_mask = paddle.fluid.layers.sequence_mask(dec_lengths, dtype=q.dtype)
y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))

View File

@ -1,121 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import transformer_tts as tts
from parakeet.modules import masking
from pprint import pprint
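# Shape-only smoke tests for the TransformerTTS-specific attention and
# encoder/decoder layers.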
class TestMultiheadAttention(unittest.TestCase):
def test_io_same_qk(self):
net = tts.MultiheadAttention(64, 8)
q = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q
context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
def test_io(self):
net = tts.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
q = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q
context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerEncoderLayer(unittest.TestCase):
def test_io(self):
net = tts.TransformerEncoderLayer(64, 8, 128)
x = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=x.dtype)
context_vector, attention_weights = net(x, mask)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerDecoderLayer(unittest.TestCase):
def test_io(self):
net = tts.TransformerDecoderLayer(64, 8, 128, 0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_lengths = paddle.to_tensor([24, 18, 20, 22])
dec_lengths = paddle.to_tensor([32, 28, 30, 31])
enc_mask = masking.sequence_mask(enc_lengths, dtype=k.dtype)
dec_padding_mask = masking.sequence_mask(dec_lengths, dtype=q.dtype)
no_future_mask = masking.future_mask(32, dtype=q.dtype)
dec_mask = masking.combine_mask(dec_padding_mask.unsqueeze(-1), no_future_mask)
y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))
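# End-to-end checks for TransformerTTS on random inputs: encode,
# teacher-forced decode, and autoregressive predict.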
class TestTransformerTTS(unittest.TestCase):
def setUp(self):
net = tts.TransformerTTS(
128, 0, 64, 128, 80, 4, 128,
6, 6, 128, 128, 4,
3, 10, 0.1)
self.net = net
def test_encode_io(self):
net = self.net
text = paddle.randint(0, 128, [4, 176])
lengths = paddle.to_tensor([176, 156, 174, 168])
mask = masking.sequence_mask(lengths, dtype=text.dtype)
text = text * mask
encoded, attention_weights, encoder_mask = net.encode(text)
print("output shapes:")
print("encoded:", encoded.numpy().shape)
print("encoder_attentions:", [item.shape for item in attention_weights])
print("encoder_mask:", encoder_mask.numpy().shape)
def test_all_io(self):
net = self.net
text = paddle.randint(0, 128, [4, 176])
lengths = paddle.to_tensor([176, 156, 174, 168])
mask = masking.sequence_mask(lengths, dtype=text.dtype)
text = text * mask
mel = paddle.randn([4, 189, 80])
frames = paddle.to_tensor([189, 186, 179, 174])
mask = masking.sequence_mask(frames, dtype=frames.dtype)
mel = mel * mask.unsqueeze(-1)
encoded, encoder_attention_weights, encoder_mask = net.encode(text)
mel_output, mel_intermediate, cross_attention_weights, stop_logits = net.decode(encoded, mel, encoder_mask)
print("output shapes:")
print("encoder_output:", encoded.numpy().shape)
print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
print("encoder_mask:", encoder_mask.numpy().shape)
print("mel_output: ", mel_output.numpy().shape)
print("mel_intermediate: ", mel_intermediate.numpy().shape)
print("decoder_attentions:", [item.shape for item in cross_attention_weights])
print("stop_logits:", stop_logits.numpy().shape)
def test_predict_io(self):
net = self.net
net.eval()
with paddle.no_grad():
text = paddle.randint(0, 128, [176])
decoder_output, encoder_attention_weights, cross_attention_weights = net.predict(text)
print("output shapes:")
print("mel_output: ", decoder_output.numpy().shape)
print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
print("decoder_attentions:", [item.shape for item in cross_attention_weights])

View File

@ -1,130 +0,0 @@
import numpy as np
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import waveflow
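# fold reshapes the trailing time dimension into frames: (..., T) becomes
# (..., T // hop, hop).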
class TestFold(unittest.TestCase):
def test_audio(self):
x = paddle.randn([4, 32 * 8])
y = waveflow.fold(x, 8)
self.assertTupleEqual(y.numpy().shape, (4, 32, 8))
def test_spec(self):
x = paddle.randn([4, 80, 32 * 8])
y = waveflow.fold(x, 8)
self.assertTupleEqual(y.numpy().shape, (4, 80, 32, 8))
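# UpsampleNet stretches the time axis by the product of its scale factors
# (here 2 * 2 = 4).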
class TestUpsampleNet(unittest.TestCase):
def test_io(self):
net = waveflow.UpsampleNet([2, 2])
x = paddle.randn([4, 8, 6])
y = net(x)
self.assertTupleEqual(y.numpy().shape, (4, 8, 2 * 2 * 6))
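# ResidualBlock tests cover both the parallel forward pass and the
# row-by-row add_input path used at inference time.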
class TestResidualBlock(unittest.TestCase):
def test_io(self):
net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
x = paddle.randn([4, 4, 16, 32])
condition = paddle.randn([4, 6, 16, 32])
res, skip = net(x, condition)
self.assertTupleEqual(res.numpy().shape, (4, 4, 16, 32))
self.assertTupleEqual(skip.numpy().shape, (4, 4, 16, 32))
def test_add_input(self):
net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
net.eval()
net.start_sequence()
x_row = paddle.randn([4, 4, 1, 32])
condition_row = paddle.randn([4, 6, 1, 32])
res, skip = net.add_input(x_row, condition_row)
self.assertTupleEqual(res.numpy().shape, (4, 4, 1, 32))
self.assertTupleEqual(skip.numpy().shape, (4, 4, 1, 32))
class TestResidualNet(unittest.TestCase):
def test_io(self):
net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
x = paddle.randn([4, 6, 8, 32])
condition = paddle.randn([4, 8, 8, 32])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 6, 8, 32))
def test_add_input(self):
net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
net.eval()
net.start_sequence()
x_row = paddle.randn([4, 6, 1, 32])
condition_row = paddle.randn([4, 8, 1, 32])
y_row = net.add_input(x_row, condition_row)
self.assertTupleEqual(y_row.numpy().shape, (4, 6, 1, 32))
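# A single flow maps x to z given the condition; (logs, b) appear to be
# the per-element log-scale and shift of the affine transform, computed
# for all rows but the first.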
class TestFlow(unittest.TestCase):
def test_io(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
x = paddle.randn([4, 1, 8, 32])
condition = paddle.randn([4, 7, 8, 32])
z, (logs, b) = net(x, condition)
self.assertTupleEqual(z.numpy().shape, (4, 1, 8, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
def test_inverse_row(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
net.eval()
net._start_sequence()
x_row = paddle.randn([4, 1, 1, 32]) # last row
condition_row = paddle.randn([4, 7, 1, 32])
z_row = paddle.randn([4, 1, 1, 32])
x_next_row, (logs, b) = net._inverse_row(z_row, x_row, condition_row)
self.assertTupleEqual(x_next_row.numpy().shape, (4, 1, 1, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 1, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 1, 32))
def test_inverse(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
net.eval()
z = paddle.randn([4, 1, 8, 32])
condition = paddle.randn([4, 7, 8, 32])
with paddle.no_grad():
x, (logs, b) = net.inverse(z, condition)
self.assertTupleEqual(x.numpy().shape, (4, 1, 8, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
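# Full-model checks: forward returns z plus a scalar log-determinant, and
# inverse reconstructs audio of the original length.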
class TestWaveFlow(unittest.TestCase):
def test_io(self):
        x = paddle.randn([4, 32 * 8])
condition = paddle.randn([4, 7, 32 * 8])
net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
z, logs_det_jacobian = net(x, condition)
self.assertTupleEqual(z.numpy().shape, (4, 32 * 8))
self.assertTupleEqual(logs_det_jacobian.numpy().shape, (1,))
def test_inverse(self):
        z = paddle.randn([4, 32 * 8])
condition = paddle.randn([4, 7, 32 * 8])
net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
net.eval()
with paddle.no_grad():
x = net.inverse(z, condition)
self.assertTupleEqual(x.numpy().shape, (4, 32 * 8))