Merge pull request #66 from iclementine/reborn

format code and discard opencc

commit fe7ddc2aaf
@@ -228,6 +228,6 @@ Parakeet also provides pretrained parameters for the example models, available from the table below
 
 Under development.
 
 ## Copyright and License
 
 Parakeet is provided under the [Apache-2.0 license](LICENSE).
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -14,7 +28,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 
-
 # -- Project information -----------------------------------------------------
 
 project = 'parakeet'
@@ -24,7 +37,6 @@ author = 'parakeet-developers'
 # The full version, including alpha/beta/rc tags
 release = '0.2'
 
-
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
@@ -33,7 +45,7 @@ release = '0.2'
 extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.viewcode',
     "sphinx_rtd_theme",
     'sphinx.ext.mathjax',
     'numpydoc',
 ]
@@ -46,7 +58,6 @@ templates_path = ['_templates']
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []
 
-
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages. See the documentation for
@@ -18,7 +18,7 @@
 
 Common configuration file formats include `ini`, `yaml`, `toml`, and `json`.
 
 `ini`
 Pros: simple; supports string interpolation and similar operations.
 Cons: only two levels of structure are supported, and values carry no type information, so they must be cast manually when parsed.
 
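To make the `ini` drawback concrete, here is a minimal sketch using the standard-library `configparser` (illustrative only, not code from this repository): parsed values come back as strings and must be cast by hand.

```python
import configparser

parser = configparser.ConfigParser()
parser.read_string("""
[training]
lr = 1e-4
batch_size = 16
""")

# configparser returns strings, so every typed value needs a manual cast.
lr = float(parser["training"]["lr"])
batch_size = int(parser["training"]["batch_size"])
print(lr, batch_size)  # 0.0001 16
```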
@@ -102,11 +102,3 @@ optional arguments:
   --opts ...            options to overwrite --config file and the default
                         config, passing in KEY VALUE pairs
 ```
-
-
-
-
-
-
-
-
@@ -21,7 +21,7 @@
 
 In general, we treat a Dataset subclass as an adapter between a dataset and the concrete needs of an experiment.
 
 parakeet also provides several higher-order Dataset classes for deriving new Datasets from existing ones:
 
 1. For combining fields: TupleDataset, DictDataset;
 2. For splitting and merging datasets: SliceDataset, SubsetDataset, ChainDataset;
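A rough sketch of the semantics such wrappers typically have (assumed behavior for illustration; the exact constructors in `parakeet.data.dataset` may differ):

```python
class TupleDataset:
    """Combine fields: zip several equal-length datasets into tuples."""
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        return tuple(d[i] for d in self.datasets)

    def __len__(self):
        return len(self.datasets[0])


class SliceDataset:
    """Split a dataset: a view of the contiguous range [start, stop)."""
    def __init__(self, dataset, start, stop):
        self.dataset = dataset
        self.start, self.stop = start, stop

    def __getitem__(self, i):
        return self.dataset[self.start + i]

    def __len__(self):
        return self.stop - self.start
```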
@@ -137,7 +137,7 @@ class Transform(object):
         self.processor = AudioProcessor(
             sample_rate=22050,
             n_fft=1024,
             win_length=1024,
             hop_length=256,
             f_max=8000)
         self.normalizer = LogMagnitude()
@@ -167,7 +167,7 @@ ljspeech = TransformDataset(meta, transform)
 
 Alternatively, you can write a dedicated conversion script that saves the transformed dataset to disk, then write a matching Dataset subclass to load the saved data. In practice this is more efficient.
 
 Next we need a callable object that assembles multiple examples into a batch. Since the ids and mel spectrograms are sequential data, we need to apply padding.
 
 ```python
 class LJSpeechCollector(object):
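A minimal sketch of what such a collator does for the ids field (assumed shapes, not the repository's implementation): pad every sequence to the batch maximum with the padding index, then stack.

```python
import numpy as np

def pad_ids(batch_ids, padding_idx=0):
    # Pad each 1-D id sequence on the right to the longest length in the batch.
    max_len = max(len(ids) for ids in batch_ids)
    return np.stack([
        np.pad(ids, (0, max_len - len(ids)), constant_values=padding_idx)
        for ids in batch_ids
    ])

padded = pad_ids([np.array([3, 5, 7]), np.array([2, 4])])
print(padded)  # [[3 5 7]
               #  [2 4 0]]
```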
@@ -197,10 +197,10 @@ def create_dataloader(source_path, valid_size, batch_size):
 
     valid_set, train_set = dataset.split(lj, valid_size)
     train_loader = DataLoader(
         train_set,
         return_list=False,
         batch_size=batch_size,
         shuffle=True,
         drop_last=True,
         collate_fn=LJSpeechCollector())
     valid_loader = DataLoader(
@@ -72,4 +72,4 @@ def train(self):
 
 ```python
 exp.run()
 ```
@@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+
 ```
 
 This repository contains several examples, which can be found in [Parakeet/examples](../examples). These experiments are provided as samples that users can run directly. Users are also welcome to add new models and experiments and contribute code to `Parakeet`.
-
-
@@ -31,7 +31,7 @@ python -m pip install paddlepaddle==2.0.0rc0 -i https://mirror.baidu.com/pypi/simple
 # ubuntu, debian
 sudo apt-get install libsndfile1
 
 # centos, fedora,
 sudo yum install libsndfile
 
 # openSUSE
@@ -9,10 +9,3 @@ Parakeet provides users and developers with
 1. reusable models and common modules;
 2. complete experiments covering the whole pipeline from data processing and model training to prediction;
 3. high-quality out-of-the-box models.
-
-
-
-
-
-
-
@@ -1,21 +1,34 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN
 
 _C = CN()
 _C.data = CN(
     dict(
         batch_size=16,  # batch size
         valid_size=64,  # the first N examples are reserved for validation
         sample_rate=22050,  # Hz, sample rate
         n_fft=1024,  # fft frame size
         win_length=1024,  # window size
         hop_length=256,  # hop size between adjacent frames
         f_max=8000,  # Hz, max frequency when converting to mel
         d_mel=80,  # mel bands
         padding_idx=0,  # text embedding's padding index
         mel_start_value=0.5,  # value for the starting frame
         mel_end_value=-0.5,  # value for the ending frame
-    )
-)
+    ))
 
 _C.model = CN(
     dict(
@@ -31,22 +44,21 @@ _C.model = CN(
         postnet_kernel_size=5,  # decoder postnet(cnn)'s kernel size
         max_reduction_factor=10,  # max reduction factor
         dropout=0.1,  # global dropout probability
         stop_loss_scale=8.0,  # scaler for stop loss
         decoder_prenet_dropout=0.5,  # decoder prenet dropout probability
-    )
-)
+    ))
 
 _C.training = CN(
     dict(
         lr=1e-4,  # learning rate
         drop_n_heads=[[0, 0], [15000, 1]],
         reduction_factor=[[0, 10], [80000, 4], [200000, 2]],
         plot_interval=1000,  # plot attention and spectrogram
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=900000,  # max iteration to train
-    )
-)
+    ))
 
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""
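How such a yacs config is consumed in practice (the same merge pattern appears in the entry scripts later in this diff; the file name here is illustrative):

```python
from config import get_cfg_defaults

config = get_cfg_defaults()
# Overrides from a YAML file and/or from flat KEY VALUE pairs.
config.merge_from_file("custom.yaml")             # hypothetical override file
config.merge_from_list(["data.batch_size", "32"])
print(config.data.batch_size)  # -> 32
```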
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader
 from parakeet.data.batch import batch_spec, batch_text_id
 from parakeet.data import dataset
 
+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         records = []
@@ -35,13 +51,13 @@ class Transform(object):
         self.end_value = end_value
 
     def __call__(self, example):
         ids, mel = example  # ids already have <s> and </s>
         ids = np.array(ids, dtype=np.int64)
         # add start and end frame
-        mel = np.pad(mel,
-                     [(0, 0), (1, 1)],
+        mel = np.pad(
+            mel, [(0, 0), (1, 1)],
             mode='constant',
             constant_values=[(0, 0), (self.start_value, self.end_value)])
         stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
         stop_labels[-1] = 2
         # actually this thing can also be done within the model
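What the `np.pad` call above does, on a tiny example (assuming the mel layout is `(d_mel, T)`, which matches the pad width `[(0, 0), (1, 1)]`):

```python
import numpy as np

mel = np.zeros((2, 3))  # 2 mel bands, 3 frames
padded = np.pad(
    mel, [(0, 0), (1, 1)],
    mode='constant',
    constant_values=[(0, 0), (0.5, -0.5)])
print(padded.shape)  # (2, 5): one start frame of 0.5, one end frame of -0.5
print(padded[0])     # [ 0.5  0.   0.   0.  -0.5]
```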
@@ -50,6 +66,7 @@ class Transform(object):
 
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_idx=0, padding_value=0.):
         self.padding_idx = padding_idx
         self.padding_value = padding_value
@@ -67,15 +84,16 @@ class LJSpeechCollector(object):
 
 def create_dataloader(config, source_path):
     lj = LJSpeech(source_path)
-    transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
+    transform = Transform(config.data.mel_start_value,
+                          config.data.mel_end_value)
     lj = dataset.TransformDataset(lj, transform)
 
     valid_set, train_set = dataset.split(lj, config.data.valid_size)
     data_collator = LJSpeechCollector(padding_idx=config.data.padding_idx)
     train_loader = DataLoader(
         train_set,
         batch_size=config.data.batch_size,
         shuffle=True,
         drop_last=True,
         collate_fn=data_collator)
     valid_loader = DataLoader(
@@ -85,4 +103,3 @@ def create_dataloader(config, source_path):
         drop_last=False,
         collate_fn=data_collator)
     return train_loader, valid_loader
-
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import pickle
@@ -11,6 +25,7 @@ from parakeet.frontend import English
 
 from config import get_cfg_defaults
 
+
 def create_dataset(config, source_path, target_path, verbose=False):
     # create output dir
     target_path = Path(target_path).expanduser()
@@ -23,11 +38,11 @@ def create_dataset(config, source_path, target_path, verbose=False):
         sample_rate=config.data.sample_rate,
         n_fft=config.data.n_fft,
         n_mels=config.data.d_mel,
         win_length=config.data.win_length,
         hop_length=config.data.hop_length,
         f_max=config.data.f_max)
     normalizer = LogMagnitude()
 
     records = []
     for (fname, text, _) in tqdm.tqdm(meta_data):
         wav = processor.read_wav(fname)
@@ -42,12 +57,13 @@ def create_dataset(config, source_path, target_path, verbose=False):
         np.save(mel_path / mel_name, mel)
     if verbose:
         print("save mel spectrograms into {}".format(mel_path))
 
     # save meta data as pickle archive
     with open(target_path / "metadata.pkl", 'wb') as f:
         pickle.dump(records, f)
     if verbose:
-        print("saved metadata into {}".format(target_path / "metadata.pkl"))
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.pkl"))
 
     # also save meta data into text format for inspection
     with open(target_path / "metadata.txt", 'wt') as f:
@@ -55,21 +71,31 @@ def create_dataset(config, source_path, target_path, verbose=False):
             phoneme_str = "|".join(phonemes)
             f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
     if verbose:
-        print("saved metadata into {}".format(target_path / "metadata.txt"))
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.txt"))
 
     print("Done.")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
 
     config = get_cfg_defaults()
     args = parser.parse_args()
     if args.config:
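A small standalone demonstration of why `--opts` uses `argparse.REMAINDER` (not repository code): everything after `--opts` is collected verbatim as KEY VALUE pairs, ready for `config.merge_from_list`.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str)
parser.add_argument("--opts", nargs=argparse.REMAINDER)

# REMAINDER swallows all remaining command-line tokens unparsed.
args = parser.parse_args(
    ["--input", "ljspeech", "--opts",
     "data.batch_size", "32", "training.lr", "1e-4"])
print(args.opts)  # ['data.batch_size', '32', 'training.lr', '1e-4']
```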
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import time
 from pathlib import Path
@@ -13,21 +27,22 @@ from parakeet.utils.display import add_attention_plots
 
 from config import get_cfg_defaults
 
 
 @paddle.fluid.dygraph.no_grad
 def main(config, args):
     paddle.set_device(args.device)
 
     # model
     frontend = English()
-    model = TransformerTTS.from_pretrained(
-        frontend, config, args.checkpoint_path)
+    model = TransformerTTS.from_pretrained(frontend, config,
+                                           args.checkpoint_path)
     model.eval()
 
     # inputs
     input_path = Path(args.input).expanduser()
     with open(input_path, "rt") as f:
         sentences = f.readlines()
 
     output_dir = Path(args.output).expanduser()
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -38,22 +53,36 @@ def main(config, args):
         mel_output = mel_output.T  #(C, T)
         np.save(str(output_dir / f"sentence_{i}"), mel_output)
         if args.verbose:
-            print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy"))
+            print("spectrogram saved at {}".format(output_dir /
+                                                   f"sentence_{i}.npy"))
 
 
 if __name__ == "__main__":
     config = get_cfg_defaults()
 
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
     parser.add_argument("--input", type=str, help="path of the text sentences")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
 
     args = parser.parse_args()
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 import logging
 from pathlib import Path
@@ -19,12 +33,13 @@ from parakeet.training.experiment import ExperimentBase
 from config import get_cfg_defaults
 from ljspeech import LJSpeech, LJSpeechCollector, Transform
 
 
 class Experiment(ExperimentBase):
     def setup_model(self):
         config = self.config
         frontend = English()
         model = TransformerTTS(
             frontend,
             d_encoder=config.model.d_encoder,
             d_decoder=config.model.d_decoder,
             d_mel=config.data.d_mel,
@@ -46,8 +61,7 @@ class Experiment(ExperimentBase):
             beta1=0.9,
             beta2=0.98,
             epsilon=1e-9,
-            parameters=model.parameters()
-        )
+            parameters=model.parameters())
         criterion = TransformerTTSLoss(config.model.stop_loss_scale)
         drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
         reduction_factor = scheduler.StepWise(config.training.reduction_factor)
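A hedged sketch of what a step-wise schedule over anchors like `[[0, 0], [15000, 1]]` plausibly means: return the value of the last anchor whose step is at or below the current iteration (assumed semantics, not parakeet's exact `StepWise` implementation):

```python
def stepwise(anchors, iteration):
    # anchors: list of [step, value] pairs in ascending step order.
    value = anchors[0][1]
    for step, v in anchors:
        if iteration >= step:
            value = v
    return value

assert stepwise([[0, 0], [15000, 1]], 100) == 0
assert stepwise([[0, 0], [15000, 1]], 20000) == 1
```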
@@ -63,21 +77,24 @@ class Experiment(ExperimentBase):
         config = self.config
 
         ljspeech_dataset = LJSpeech(args.data)
-        transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
-        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        transform = Transform(config.data.mel_start_value,
+                              config.data.mel_end_value)
+        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
+                                                    transform)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
         batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
 
         if not self.parallel:
             train_loader = DataLoader(
                 train_set,
                 batch_size=config.data.batch_size,
                 shuffle=True,
                 drop_last=True,
                 collate_fn=batch_fn)
         else:
             sampler = DistributedBatchSampler(
                 train_set,
                 batch_size=config.data.batch_size,
                 num_replicas=dist.get_world_size(),
                 rank=dist.get_rank(),
@@ -95,11 +112,11 @@ class Experiment(ExperimentBase):
     def compute_outputs(self, text, mel, stop_label):
         model_core = self.model._layers if self.parallel else self.model
         model_core.set_constants(
             self.reduction_factor(self.iteration),
             self.drop_n_heads(self.iteration))
 
         # TODO(chenfeiyu): we can combine these 2 slices
-        mel_input = mel[:,:-1, :]
+        mel_input = mel[:, :-1, :]
         reduced_mel_input = mel_input[:, ::model_core.r, :]
         outputs = self.model(text, reduced_mel_input)
         return outputs
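What the `[::model_core.r]` slice above does: with reduction factor r, the decoder consumes every r-th frame. A tiny shape check with illustrative values:

```python
import numpy as np

mel = np.zeros((1, 12, 80))   # (batch, frames, d_mel)
mel_input = mel[:, :-1, :]    # drop the last frame (teacher-forcing shift)
r = 4
reduced = mel_input[:, ::r, :]
print(reduced.shape)  # (1, 3, 80): ceil(11 / 4) frames survive
```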
@@ -115,11 +132,8 @@ class Experiment(ExperimentBase):
 
         time_steps = mel_target.shape[1]
         losses = self.criterion(
-            mel_output[:,:time_steps, :],
-            mel_intermediate[:,:time_steps, :],
-            mel_target,
-            stop_logits[:,:time_steps, :],
-            stop_label_target)
+            mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
+            mel_target, stop_logits[:, :time_steps, :], stop_label_target)
         return losses
 
     def train_batch(self):
@@ -133,7 +147,7 @@ class Experiment(ExperimentBase):
         outputs = self.compute_outputs(text, mel, stop_label)
         losses = self.compute_losses(batch, outputs)
         loss = losses["loss"]
         loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start
 
@@ -141,14 +155,17 @@ class Experiment(ExperimentBase):
         # logging
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items())
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in losses_np.items())
         self.logger.info(msg)
 
         if dist.get_rank() == 0:
             for k, v in losses_np.items():
-                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
+                self.visualizer.add_scalar(f"train_loss/{k}", v,
+                                           self.iteration)
 
     @mp_tools.rank_zero_only
     @paddle.no_grad()
     def valid(self):
@@ -163,10 +180,9 @@ class Experiment(ExperimentBase):
             if i < 2:
                 attention_weights = outputs["cross_attention_weights"]
                 display.add_multi_attention_plots(
                     self.visualizer,
                     f"valid_sentence_{i}_cross_attention_weights",
-                    attention_weights,
-                    self.iteration)
+                    attention_weights, self.iteration)
 
         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@@ -191,7 +207,7 @@ if __name__ == "__main__":
     config = get_cfg_defaults()
     parser = default_argument_parser()
     args = parser.parse_args()
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -1,40 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN
 
 _C = CN()
 _C.data = CN(
     dict(
         batch_size=8,  # batch size
         valid_size=16,  # the first N examples are reserved for validation
         sample_rate=22050,  # Hz, sample rate
         n_fft=1024,  # fft frame size
         win_length=1024,  # window size
         hop_length=256,  # hop size between adjacent frames
         f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         clip_frames=65,  # mel clip frames
-    )
-)
+    ))
 
 _C.model = CN(
     dict(
         upsample_factors=[16, 16],
         n_flows=8,  # number of flows in WaveFlow
         n_layers=8,  # number of conv blocks in each flow
         n_group=16,  # folding factor of audio and spectrogram
         channels=128,  # residual channels in each flow
         kernel_size=[3, 3],  # kernel size in each conv block
         sigma=1.0,  # stddev of the random noise
-    )
-)
+    ))
 
 _C.training = CN(
     dict(
         lr=2e-4,  # learning rate
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=3000000,  # max iteration to train
-    )
-)
+    ))
 
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor
 
+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
+            names=["fname", "frames", "samples"])
 
         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
 
@@ -52,9 +68,9 @@ class LJSpeechCollector(object):
 
 class LJSpeechClipCollector(object):
     def __init__(self, clip_frames=65, hop_length=256):
         self.clip_frames = clip_frames
         self.hop_length = hop_length
 
     def __call__(self, examples):
         mels = []
         wavs = []
@@ -70,9 +86,7 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        mel_clip = mel[:, start: start + self.clip_frames]
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        mel_clip = mel[:, start:start + self.clip_frames]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel_clip, wav_clip
-
-
-
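The clip logic above keeps mel and wav aligned: a window of `clip_frames` spectrogram frames corresponds to `clip_frames * hop_length` audio samples. A tiny check with assumed shapes (mel: `(n_mels, frames)`, wav: flat samples):

```python
import numpy as np

clip_frames, hop_length = 65, 256
mel = np.zeros((80, 100))
wav = np.zeros(100 * hop_length)

start = np.random.randint(0, mel.shape[-1] - clip_frames)
mel_clip = mel[:, start:start + clip_frames]
wav_clip = wav[start * hop_length:(start + clip_frames) * hop_length]
assert mel_clip.shape[-1] == clip_frames
assert len(wav_clip) == clip_frames * hop_length
```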
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)
 
-    transform = Transform(
-        config.sample_rate,
-        config.n_fft,
-        config.win_length,
-        config.hop_length,
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)
     file_names = []
 
     for example in tqdm.tqdm(dataset):
@@ -107,23 +117,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         np.save(str(mel_dir / base_name), mel)
 
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
 
     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
 
     print("Done!")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
 
     config = get_cfg_defaults()
     args = parser.parse_args()
     if args.config:
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -8,9 +22,9 @@ import parakeet
 from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow
 from parakeet.utils import layer_tools, checkpoint
 
 from config import get_cfg_defaults
 
 
 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path)
@@ -23,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))
@@ -31,17 +46,32 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
 
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")
 
     args = parser.parse_args()
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -49,4 +79,4 @@ if __name__ == "__main__":
     print(config)
     print(args)
 
     main(config, args)
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import numpy as np
@@ -34,7 +48,8 @@ class Experiment(ExperimentBase):
 
         if self.parallel > 1:
             model = paddle.DataParallel(model)
-        optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters())
+        optimizer = paddle.optimizer.Adam(
+            config.training.lr, parameters=model.parameters())
         criterion = WaveFlowLoss(sigma=config.model.sigma)
 
         self.model = model
@@ -46,20 +61,22 @@ class Experiment(ExperimentBase):
         args = self.args
 
         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
 
+        batch_fn = LJSpeechClipCollector(config.data.clip_frames,
+                                         config.data.hop_length)
 
-        batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length)
 
         if not self.parallel:
             train_loader = DataLoader(
                 train_set,
                 batch_size=config.data.batch_size,
                 shuffle=True,
                 drop_last=True,
                 collate_fn=batch_fn)
         else:
             sampler = DistributedBatchSampler(
                 train_set,
                 batch_size=config.data.batch_size,
                 num_replicas=dist.get_world_size(),
                 rank=dist.get_rank(),
@@ -71,7 +88,7 @@ class Experiment(ExperimentBase):
         valid_batch_fn = LJSpeechCollector()
         valid_loader = DataLoader(
             valid_set, batch_size=1, collate_fn=valid_batch_fn)
 
         self.train_loader = train_loader
         self.valid_loader = valid_loader
 
@@ -90,17 +107,19 @@ class Experiment(ExperimentBase):
         mel, wav = batch
         z, log_det_jocobian = self.compute_outputs(mel, wav)
         loss = self.criterion(z, log_det_jocobian)
         loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start
 
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)
 
     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -112,7 +131,8 @@ class Experiment(ExperimentBase):
         loss = self.criterion(z, log_det_jocobian)
         valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)
 
 
 def main_sp(config, args):
@@ -132,7 +152,7 @@ if __name__ == "__main__":
     config = get_cfg_defaults()
     parser = default_argument_parser()
     args = parser.parse_args()
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -1,19 +1,32 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN
 
 _C = CN()
 _C.data = CN(
     dict(
         batch_size=8,  # batch size
         valid_size=16,  # the first N examples are reserved for validation
         sample_rate=22050,  # Hz, sample rate
         n_fft=2048,  # fft frame size
         win_length=1024,  # window size
         hop_length=256,  # hop size between adjacent frames
         # f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         train_clip_seconds=0.5,  # audio clip length (in seconds)
-    )
-)
+    ))
 
 _C.model = CN(
     dict(
@ -21,24 +34,22 @@ _C.model = CN(
|
||||||
n_stack=3,
|
n_stack=3,
|
||||||
n_loop=10,
|
n_loop=10,
|
||||||
filter_size=2,
|
filter_size=2,
|
||||||
residual_channels=128, # resiaudal channel in each flow
|
residual_channels=128, # resiaudal channel in each flow
|
||||||
loss_type="mog",
|
loss_type="mog",
|
||||||
output_dim=3, # single gaussian
|
output_dim=3, # single gaussian
|
||||||
log_scale_min=-9.0,
|
log_scale_min=-9.0, ))
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
_C.training = CN(
|
_C.training = CN(
|
||||||
dict(
|
dict(
|
||||||
lr=1e-3, # learning rates
|
lr=1e-3, # learning rates
|
||||||
anneal_rate=0.5, # learning rate decay rate
|
anneal_rate=0.5, # learning rate decay rate
|
||||||
anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps
|
anneal_interval=200000, # decrese lr by annel_rate every anneal_interval steps
|
||||||
valid_interval=1000, # validation
|
valid_interval=1000, # validation
|
||||||
save_interval=10000, # checkpoint
|
save_interval=10000, # checkpoint
|
||||||
max_iteration=3000000, # max iteration to train
|
max_iteration=3000000, # max iteration to train
|
||||||
gradient_max_norm=100.0 # global norm of gradients
|
gradient_max_norm=100.0 # global norm of gradients
|
||||||
)
|
))
|
||||||
)
|
|
||||||
|
|
||||||
def get_cfg_defaults():
|
def get_cfg_defaults():
|
||||||
"""Get a yacs CfgNode object with default values for my_project."""
|
"""Get a yacs CfgNode object with default values for my_project."""
|
||||||
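For context, these defaults are consumed through yacs's merge mechanism; this is what the --config and --opts flags of the scripts below feed into (they import the module as config, e.g. from config import get_cfg_defaults). A minimal sketch, with a hypothetical yaml file name:

    from config import get_cfg_defaults  # the module shown above

    config = get_cfg_defaults()
    config.merge_from_file("custom.yaml")              # overrides from --config
    config.merge_from_list(["data.batch_size", "16"])  # overrides from --opts
    print(config.data.batch_size)  # 16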
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor


 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
+            names=["fname", "frames", "samples"])

         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):

 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value

@@ -48,15 +64,15 @@ class LJSpeechCollector(object):
         wavs = [example[1] for example in examples]
         mels = batch_spec(mels, pad_value=self.padding_value)
         wavs = batch_wav(wavs, pad_value=self.padding_value)
-        audio_starts = np.zeros((batch_size,), dtype=np.int64)
+        audio_starts = np.zeros((batch_size, ), dtype=np.int64)
         return mels, wavs, audio_starts


 class LJSpeechClipCollector(object):
     def __init__(self, clip_frames=65, hop_length=256):
         self.clip_frames = clip_frames
         self.hop_length = hop_length

     def __call__(self, examples):
         mels = []
         wavs = []
@@ -75,7 +91,8 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel, wav_clip, start


@@ -132,7 +149,3 @@ class DataCollector(object):
         audios = np.array(audios, dtype=np.float32)
         audio_starts = np.array(audio_starts, dtype=np.int64)
         return audios, mels, audio_starts
-
-
-
-
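The clip collector above aligns a random window of mel frames with the matching audio samples through hop_length. A sketch of the index arithmetic, using the defaults shown (clip_frames=65, hop_length=256):

    import numpy as np

    hop_length, clip_frames = 256, 65
    frames = 400                               # mel frames of one utterance (made up)
    start = np.random.randint(0, frames - clip_frames)
    lo = start * hop_length                    # first audio sample of the clip
    hi = (start + clip_frames) * hop_length    # one past the last sample
    assert hi - lo == clip_frames * hop_length  # always 65 * 256 = 16640 samples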
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -23,7 +37,7 @@ class Transform(object):
         self.win_length = win_length
         self.hop_length = hop_length
         self.n_mels = n_mels

         self.spec_normalizer = UnitMagnitude(min=1e-5)

     def __call__(self, example):
@@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)

-    transform = Transform(
-        config.sample_rate,
-        config.n_fft,
-        config.win_length,
-        config.hop_length,
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)
     file_names = []

     for example in tqdm.tqdm(dataset):
@@ -108,23 +118,35 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         np.save(str(mel_dir / base_name), mel)

         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))

     print("Done!")


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()
     if args.config:
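Because --opts uses argparse.REMAINDER, everything after it is collected verbatim as KEY VALUE pairs and handed to config.merge_from_list. A sketch with a hypothetical command line:

    import argparse

    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument("--input", type=str)
    parser.add_argument("--output", type=str)
    parser.add_argument("--opts", nargs=argparse.REMAINDER)
    args = parser.parse_args(
        ["--input", "LJSpeech-1.1", "--output", "ljspeech",
         "--opts", "data.n_mels", "80"])
    print(args.opts)  # ['data.n_mels', '80'], later fed to config.merge_from_list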
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint

 from config import get_cfg_defaults

+
 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path)
@@ -22,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -30,17 +46,32 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()

-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
@@ -48,4 +79,4 @@ if __name__ == "__main__":
     print(config)
     print(args)

     main(config, args)
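The synthesis loop above has a simple I/O contract: each .npy mel file in the input directory becomes one .wav in the output directory. A reduced sketch (paths are hypothetical, and a zero array stands in for model.predict):

    import numpy as np
    import soundfile as sf
    from pathlib import Path

    mel_dir, output_dir, sample_rate = Path("mels"), Path("wavs"), 22050
    output_dir.mkdir(exist_ok=True)
    for file_path in mel_dir.iterdir():
        mel = np.load(str(file_path))                     # (n_mels, T)
        audio = np.zeros(mel.shape[-1] * 256, "float32")  # stand-in for model.predict(mel)
        sf.write(str(output_dir / (file_path.stem + ".wav")), audio, sample_rate)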
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import math
@@ -26,7 +40,7 @@ class Experiment(ExperimentBase):
         config = self.config
         model = ConditionalWaveNet(
             upsample_factors=config.model.upsample_factors,
             n_stack=config.model.n_stack,
             n_loop=config.model.n_loop,
             residual_channels=config.model.residual_channels,
             output_dim=config.model.output_dim,
@@ -39,13 +53,13 @@ class Experiment(ExperimentBase):
             model = paddle.DataParallel(model)

         lr_scheduler = paddle.optimizer.lr.StepDecay(
-            config.training.lr,
-            config.training.anneal_interval,
+            config.training.lr, config.training.anneal_interval,
             config.training.anneal_rate)
         optimizer = paddle.optimizer.Adam(
             lr_scheduler,
             parameters=model.parameters(),
-            grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm))
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(
+                config.training.gradient_max_norm))

         self.model = model
         self.model_core = model._layer if self.parallel else model
@@ -56,7 +70,8 @@ class Experiment(ExperimentBase):
         args = self.args

         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)

         # convolutional net's causal padding size
         context_size = config.model.n_stack \
@@ -66,20 +81,21 @@ class Experiment(ExperimentBase):

         # frames used to compute loss
         frames_per_second = config.data.sample_rate // config.data.hop_length
-        train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second)
+        train_clip_frames = math.ceil(config.data.train_clip_seconds *
+                                      frames_per_second)

         num_frames = train_clip_frames + context_frames
         batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)
         if not self.parallel:
             train_loader = DataLoader(
                 train_set,
                 batch_size=config.data.batch_size,
                 shuffle=True,
                 drop_last=True,
                 collate_fn=batch_fn)
         else:
             sampler = DistributedBatchSampler(
                 train_set,
                 batch_size=config.data.batch_size,
                 shuffle=True,
                 drop_last=True)
@@ -89,7 +105,7 @@ class Experiment(ExperimentBase):
         valid_batch_fn = LJSpeechCollector()
         valid_loader = DataLoader(
             valid_set, batch_size=1, collate_fn=valid_batch_fn)

         self.train_loader = train_loader
         self.valid_loader = valid_loader

@@ -101,20 +117,22 @@ class Experiment(ExperimentBase):
         self.model.train()
         self.optimizer.clear_grad()
         mel, wav, audio_starts = batch

         y = self.model(wav, mel, audio_starts)
         loss = self.model.loss(y, wav)
         loss.backward()
         self.optimizer.step()
         iteration_time = time.time() - start

         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -126,7 +144,8 @@ class Experiment(ExperimentBase):
         loss = self.model.loss(y, wav)
         valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)


 def main_sp(config, args):
@@ -146,7 +165,7 @@ if __name__ == "__main__":
     config = get_cfg_defaults()
     parser = default_argument_parser()
     args = parser.parse_args()
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
         config.merge_from_list(args.opts)
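The clip-length arithmetic re-wrapped above works out as follows with the default config (sample_rate=22050, hop_length=256, train_clip_seconds=0.5); a sketch of just that step:

    import math

    sample_rate, hop_length, train_clip_seconds = 22050, 256, 0.5
    frames_per_second = sample_rate // hop_length                          # 86
    train_clip_frames = math.ceil(train_clip_seconds * frames_per_second)  # 43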
@@ -18,15 +18,16 @@ import numpy as np

 __all__ = ["AudioProcessor"]

+
 class AudioProcessor(object):
     def __init__(self,
-                 sample_rate:int,
-                 n_fft:int,
-                 win_length:int,
-                 hop_length:int,
-                 n_mels:int=80,
-                 f_min:int=0,
-                 f_max:int=None,
+                 sample_rate: int,
+                 n_fft: int,
+                 win_length: int,
+                 hop_length: int,
+                 n_mels: int=80,
+                 f_min: int=0,
+                 f_max: int=None,
                  window="hann",
                  center=True,
                  pad_mode="reflect"):
@@ -40,7 +41,7 @@ class AudioProcessor(object):
         self.window = window
         self.center = center
         self.pad_mode = pad_mode

         # mel
         self.n_mels = n_mels
         self.f_min = f_min
@@ -48,19 +49,18 @@ class AudioProcessor(object):

         self.mel_filter = self._create_mel_filter()
         self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

     def _create_mel_filter(self):
-        mel_filter = librosa.filters.mel(
-            self.sample_rate,
-            self.n_fft,
-            n_mels=self.n_mels,
-            fmin=self.f_min,
-            fmax=self.f_max)
+        mel_filter = librosa.filters.mel(self.sample_rate,
+                                         self.n_fft,
+                                         n_mels=self.n_mels,
+                                         fmin=self.f_min,
+                                         fmax=self.f_max)
         return mel_filter

     def read_wav(self, filename):
         # resampling may occur
         wav, _ = librosa.load(filename, sr=self.sample_rate)
         return wav

     def write_wav(self, path, wav):
@@ -69,7 +69,7 @@ class AudioProcessor(object):
     def stft(self, wav):
         D = librosa.core.stft(
             wav,
-            n_fft = self.n_fft,
+            n_fft=self.n_fft,
             hop_length=self.hop_length,
             win_length=self.win_length,
             window=self.window,
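The pseudo-inverse cached above gives a rough linear inversion of the mel projection. A sketch using the same (older) positional librosa signature that the code calls:

    import numpy as np
    import librosa

    sample_rate, n_fft, n_mels = 22050, 2048, 80
    mel_filter = librosa.filters.mel(sample_rate, n_fft, n_mels=n_mels)  # (80, 1025)
    inv_mel_filter = np.linalg.pinv(mel_filter)                          # (1025, 80)
    spec = np.abs(np.random.randn(n_fft // 2 + 1, 10))  # made-up magnitude STFT
    mel = mel_filter @ spec                              # mel projection
    approx_spec = inv_mel_filter @ mel                   # approximate reconstruction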
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 This modules contains normalizers for spectrogram magnitude.
@@ -19,22 +32,24 @@ __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"]
 class NormalizerBase(object):
     def transform(self, spec):
         raise NotImplementedError("transform must be implemented")

     def inverse(self, normalized):
         raise NotImplementedError("inverse must be implemented")


 class LogMagnitude(NormalizerBase):
     """
     This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
     """
+
     def __init__(self, min=1e-7):
         self.min = min

     def transform(self, x):
         x = np.maximum(x, self.min)
         x = np.log(x)
         return x

     def inverse(self, x):
         return np.exp(x)

@@ -44,15 +59,16 @@ class UnitMagnitude(NormalizerBase):
     """
     This is the normalizer used in the
     """
+
     def __init__(self, min=1e-5):
         self.min = min

     def transform(self, x):
         db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20
         normalized = (db_scale + 100) / 100
         clipped = np.clip(normalized, 0, 1)
         return clipped

     def inverse(self, x):
         denormalized = np.clip(x, 0, 1) * 100 - 100
         out = np.exp((denormalized + 20) / 20 * np.log(10))
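Plugging numbers into the UnitMagnitude formulas above: a magnitude of 1.0 maps to -20 dB, then to 0.8 after the (db + 100) / 100 rescaling, and inverse() walks back through the same mapping (values clipped at the 1e-5 floor come back as 1e-4, the bottom of the representable range):

    import numpy as np

    x = np.array([1.0, 0.1, 1e-5])
    db_scale = 20 * np.log10(np.maximum(1e-5, x)) - 20   # [-20., -40., -120.]
    normalized = np.clip((db_scale + 100) / 100, 0, 1)   # [0.8, 0.6, 0.]
    denorm = np.clip(normalized, 0, 1) * 100 - 100
    recovered = np.exp((denorm + 20) / 20 * np.log(10))  # [1.0, 0.1, 1e-4]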
@@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided.
 import numpy as np

 __all__ = [
-    "batch_text_id", "batch_wav", "batch_spec",
-    "TextIDBatcher", "WavBatcher", "SpecBatcher",
+    "batch_text_id",
+    "batch_wav",
+    "batch_spec",
+    "TextIDBatcher",
+    "WavBatcher",
+    "SpecBatcher",
 ]


 class TextIDBatcher(object):
     """A wrapper class for `batch_text_id`."""

@@ -99,8 +104,8 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
         pad_len = max_len - example.shape[-1]
         batch.append(
             np.pad(example, [(0, pad_len)],
                    mode='constant',
                    constant_values=pad_value))
     return np.array(batch, dtype=dtype)


@@ -113,7 +118,11 @@ class SpecBatcher(object):
         self.time_major = time_major

     def __call__(self, minibatch):
-        out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype)
+        out = batch_spec(
+            minibatch,
+            pad_value=self.pad_value,
+            time_major=self.time_major,
+            dtype=self.dtype)
         return out


@@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
     """
     # assume (F, T) or (T, F)
     peek_example = minibatch[0]
-    assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram"
+    assert len(
+        peek_example.shape) == 2, "we only handles mono channel spectrogram"

     # assume (F, n_frame) or (n_frame, F)
     time_idx = 0 if time_major else -1
@@ -143,11 +153,11 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
         if time_major:
             batch.append(
                 np.pad(example, [(0, pad_len), (0, 0)],
                        mode='constant',
                        constant_values=pad_value))
         else:
             batch.append(
                 np.pad(example, [(0, 0), (0, pad_len)],
                        mode='constant',
                        constant_values=pad_value))
     return np.array(batch, dtype=dtype)
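Shape-wise, batch_wav right-pads every mono waveform to the longest example in the minibatch. A self-contained restatement of the padding step shown in the hunk (the max-length computation is assumed, since it falls outside the hunk):

    import numpy as np

    def batch_wav(minibatch, pad_value=0., dtype=np.float32):
        max_len = max(example.shape[-1] for example in minibatch)  # assumed
        batch = []
        for example in minibatch:
            pad_len = max_len - example.shape[-1]
            batch.append(
                np.pad(example, [(0, pad_len)],
                       mode='constant',
                       constant_values=pad_value))
        return np.array(batch, dtype=dtype)

    out = batch_wav([np.ones(5), np.ones(3)])
    print(out.shape)  # (2, 5); the second row ends with two zeros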
@@ -17,17 +17,25 @@ import paddle
 from paddle.io import Dataset

 __all__ = [
-    "split", "TransformDataset", "CacheDataset", "TupleDataset",
-    "DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset",
+    "split",
+    "TransformDataset",
+    "CacheDataset",
+    "TupleDataset",
+    "DictDataset",
+    "SliceDataset",
+    "SubsetDataset",
+    "FilterDataset",
     "ChainDataset",
 ]


 def split(dataset, first_size):
     """A utility function to split a dataset into two datasets."""
     first = SliceDataset(dataset, 0, first_size)
     second = SliceDataset(dataset, first_size, len(dataset))
     return first, second


 class TransformDataset(Dataset):
     def __init__(self, dataset, transform):
         """Dataset which is transformed from another with a transform.
@@ -141,7 +149,7 @@ class DictDataset(Dataset):
                     for i in six.moves.range(length)]
         else:
             return batches

     def __len__(self):
         return self._length

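split() above is what the example training scripts use to carve off the first valid_size examples for validation. A sketch with a plain list standing in for a real dataset:

    from parakeet.data import dataset

    ljspeech_dataset = list(range(100))  # stand-in for an indexable dataset
    valid_set, train_set = dataset.split(ljspeech_dataset, 16)
    print(len(valid_set), len(train_set))  # 16 84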
@@ -1,2 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.datasets.common import *
 from parakeet.datasets.ljspeech import *
@@ -1,9 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.io import Dataset
 import os
 import librosa

 __all__ = ["AudioFolderDataset"]

+
 class AudioFolderDataset(Dataset):
     def __init__(self, path, sample_rate, extension="wav"):
         self.root = os.path.expanduser(path)
@@ -19,5 +34,5 @@ class AudioFolderDataset(Dataset):

     def __getitem__(self, i):
         file_name = self.file_names[i]
         y, _ = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
         return y
@@ -1,8 +1,23 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.io import Dataset
 from pathlib import Path

 __all__ = ["LJSpeechMetaData"]

+
 class LJSpeechMetaData(Dataset):
     def __init__(self, root):
         self.root = Path(root).expanduser()
@@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset):

     def __len__(self):
         return len(self.records)
-
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.frontend.vocab import *
 from parakeet.frontend.phonectic import *
 from parakeet.frontend.punctuation import *
@@ -1,2 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.frontend.normalizer.normalizer import *
 from parakeet.frontend.normalizer.numbers import *
@@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
@@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
@@ -1,8 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 def full2half_width(ustr):
     half = []
     for u in ustr:
         num = ord(u)
         if num == 0x3000:  # 全角空格变半角
             num = 32
         elif 0xFF01 <= num <= 0xFF5E:
             num -= 0xfee0
@@ -10,15 +24,16 @@ def full2half_width(ustr):
         half.append(u)
     return ''.join(half)


 def half2full_width(ustr):
     full = []
     for u in ustr:
         num = ord(u)
         if num == 32:  # 半角空格变全角
             num = 0x3000
         elif 0x21 <= num <= 0x7E:
             num += 0xfee0
         u = chr(num)  # to unicode
         full.append(u)
+
     return ''.join(full)
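The conversion relies on full-width ASCII (U+FF01 through U+FF5E) sitting exactly 0xFEE0 above half-width ASCII, with the ideographic space U+3000 handled separately. Inlining the same arithmetic over one string:

    s = "Ｈｅｌｌｏ！"  # full-width input
    half = ''.join(
        ' ' if ord(u) == 0x3000 else
        chr(ord(u) - 0xfee0) if 0xFF01 <= ord(u) <= 0xFF5E else u
        for u in s)
    print(half)  # Hello!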
@@ -17,7 +17,8 @@ from typing import Union
 from g2p_en import G2p
 from g2pM import G2pM
 from parakeet.frontend import Vocab
-from opencc import OpenCC
+# discard opencc untill we find an easy solution to install it on windows
+# from opencc import OpenCC
 from parakeet.frontend.punctuation import get_punctuations
 from parakeet.frontend.normalizer.normalizer import normalize

@@ -211,7 +212,7 @@ class Chinese(Phonetics):
     """

     def __init__(self):
-        self.opencc_backend = OpenCC('t2s.json')
+        # self.opencc_backend = OpenCC('t2s.json')
         self.backend = G2pM()
         self.phonemes = self._get_all_syllables()
         self.punctuations = get_punctuations("cn")
@@ -236,7 +237,8 @@ class Chinese(Phonetics):
         List[str]
             The list of pronunciation sequence.
         """
-        simplified = self.opencc_backend.convert(sentence)
+        # simplified = self.opencc_backend.convert(sentence)
+        simplified = sentence
         phonemes = self.backend(simplified)
         start = self.vocab.start_symbol
         end = self.vocab.end_symbol
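This is the substantive change of the commit: with OpenCC dropped (per the comment above, because it is hard to install on Windows), the Chinese frontend no longer converts traditional characters to simplified before G2pM; the sentence is passed through unchanged. In effect:

    # before this commit (required the opencc package):
    # simplified = OpenCC('t2s.json').convert(sentence)
    # after this commit:
    sentence = "你好,世界。"
    simplified = sentence  # traditional input is no longer normalized
    # phonemes = self.backend(simplified)  # G2pM, as in the hunk above

Callers that feed traditional-Chinese text now get whatever G2pM produces for it directly.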
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import abc
 import string

@@ -13,15 +27,8 @@ EN_PUNCT = [
     "!",
 ]

-CN_PUNCT = [
-    "、",
-    ",",
-    ";",
-    ":",
-    "。",
-    "?",
-    "!"
-]
+CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"]


 def get_punctuations(lang):
     if lang == "en":
@@ -30,4 +37,3 @@ def get_punctuations(lang):
         return CN_PUNCT
     else:
         raise ValueError(f"language {lang} Not supported")
-
@@ -559,7 +559,7 @@ class TransformerTTS(nn.Layer):
     @classmethod
     def from_pretrained(cls, frontend, config, checkpoint_path):
         model = TransformerTTS(
             frontend,
             d_encoder=config.model.d_encoder,
             d_decoder=config.model.d_decoder,
             d_mel=config.data.d_mel,
@@ -575,11 +575,12 @@ class TransformerTTS(nn.Layer):
             decoder_prenet_dropout=config.model.decoder_prenet_dropout,
             dropout=config.model.dropout)

-        iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
+        iteration = checkpoint.load_parameters(
+            model, checkpoint_path=checkpoint_path)
         drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
         reduction_factor = scheduler.StepWise(config.training.reduction_factor)
         model.set_constants(
             reduction_factor=reduction_factor(iteration),
             drop_n_heads=drop_n_heads(iteration))
         return model

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import numpy as np
 from typing import List, Union, Tuple
@@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo

 __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]

+
 def fold(x, n_group):
     r"""Fold audio or spectrogram's temporal dimension in to groups.

@@ -31,6 +46,7 @@ def fold(x, n_group):
     new_shape = spatial_shape + [time_steps // n_group, n_group]
     return paddle.reshape(x, new_shape)

+
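fold's shape contract: the trailing time axis is split into [time_steps // n_group, n_group]. A numpy sketch of the same reshape (paddle.reshape behaves likewise):

    import numpy as np

    x = np.arange(12).reshape(1, 12)   # (batch, time_steps)
    folded = x.reshape(1, 12 // 4, 4)  # what fold(x, n_group=4) computes
    print(folded.shape)                # (1, 3, 4)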
 class UpsampleNet(nn.LayerList):
     """Layer to upsample mel spectrogram to the same temporal resolution with
     the corresponding waveform.
@@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList):
     ---------
     ``librosa.core.stft``
     """
+
     def __init__(self, upsample_factors):
         super(UpsampleNet, self).__init__()
         for factor in upsample_factors:
@@ -67,16 +84,18 @@ class UpsampleNet(nn.LayerList):
             init = I.Uniform(-std, std)
             self.append(
                 nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1, (3, 2 * factor),
+                    nn.Conv2DTranspose(
+                        1,
+                        1, (3, 2 * factor),
                         padding=(1, factor // 2),
                         stride=(1, factor),
                         weight_attr=init,
                         bias_attr=init)))

         # upsample factors
         self.upsample_factor = np.prod(upsample_factors)
         self.upsample_factors = upsample_factors

     def forward(self, x, trim_conv_artifact=False):
         r"""Forward pass of the ``UpsampleNet``.

@@ -131,38 +150,47 @@ class ResidualBlock(nn.Layer):
     dilations : int
         Dilations of the Convolution2d applied to the input.
     """
+
     def __init__(self, channels, cond_channels, kernel_size, dilations):
         super(ResidualBlock, self).__init__()
         # input conv
         std = math.sqrt(1 / channels * np.prod(kernel_size))
         init = I.Uniform(-std, std)
-        receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)]
+        receptive_field = [
+            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
+        ]
         rh, rw = receptive_field
         paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # causal & same
-        conv = nn.Conv2D(channels, 2 * channels, kernel_size,
-                         padding=paddings,
-                         dilation=dilations,
-                         weight_attr=init,
-                         bias_attr=init)
+        conv = nn.Conv2D(
+            channels,
+            2 * channels,
+            kernel_size,
+            padding=paddings,
+            dilation=dilations,
+            weight_attr=init,
+            bias_attr=init)
         self.conv = nn.utils.weight_norm(conv)
         self.rh = rh
         self.rw = rw
         self.dilations = dilations

         # condition projection
         std = math.sqrt(1 / cond_channels)
         init = I.Uniform(-std, std)
-        condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1),
-                                   weight_attr=init, bias_attr=init)
+        condition_proj = nn.Conv2D(
+            cond_channels,
+            2 * channels, (1, 1),
+            weight_attr=init,
+            bias_attr=init)
         self.condition_proj = nn.utils.weight_norm(condition_proj)

         # parametric residual & skip connection
         std = math.sqrt(1 / channels)
         init = I.Uniform(-std, std)
-        out_proj = nn.Conv2D(channels, 2 * channels, (1, 1),
-                             weight_attr=init, bias_attr=init)
+        out_proj = nn.Conv2D(
+            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
         self.out_proj = nn.utils.weight_norm(out_proj)

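The padding arithmetic in ResidualBlock.__init__ keeps the height axis causal and the width axis 'same'. Evaluating it for a made-up kernel_size=(3, 2) with dilations=(1, 4):

    kernel_size, dilations = (3, 2), (1, 4)
    receptive_field = [
        1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
    ]
    rh, rw = receptive_field                        # 3, 5
    paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # [2, 0, 2, 2]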
def forward(self, x, condition):
|
def forward(self, x, condition):
|
||||||
"""Compute output for a whole folded sequence.
|
"""Compute output for a whole folded sequence.
|
||||||
|
|
||||||
|
@ -185,10 +213,10 @@ class ResidualBlock(nn.Layer):
|
||||||
x_in = x
|
x_in = x
|
||||||
x = self.conv(x)
|
x = self.conv(x)
|
||||||
x += self.condition_proj(condition)
|
x += self.condition_proj(condition)
|
||||||
|
|
||||||
content, gate = paddle.chunk(x, 2, axis=1)
|
content, gate = paddle.chunk(x, 2, axis=1)
|
||||||
x = paddle.tanh(content) * F.sigmoid(gate)
|
x = paddle.tanh(content) * F.sigmoid(gate)
|
||||||
|
|
||||||
x = self.out_proj(x)
|
x = self.out_proj(x)
|
||||||
res, skip = paddle.chunk(x, 2, axis=1)
|
res, skip = paddle.chunk(x, 2, axis=1)
|
||||||
res = x_in + res
|
res = x_in + res
|
||||||
|
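Note: the block above is WaveNet's gated-tanh unit. The convolution output is split along channels into a content half and a gate half, then combined as tanh(content) * sigmoid(gate). A minimal numpy sketch of just that activation (shapes are illustrative, not the Paddle implementation):

    import numpy as np

    def gated_tanh(x):
        # x: (batch, 2 * channels, h, w) -> (batch, channels, h, w)
        content, gate = np.split(x, 2, axis=1)
        return np.tanh(content) * (1.0 / (1.0 + np.exp(-gate)))  # tanh * sigmoid

    x = np.random.randn(1, 8, 4, 16)  # 2 * channels == 8
    assert gated_tanh(x).shape == (1, 4, 4, 16)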
@@ -249,7 +277,7 @@ class ResidualBlock(nn.Layer):

         content, gate = paddle.chunk(x_row, 2, axis=1)
         x_row = paddle.tanh(content) * F.sigmoid(gate)

         x_row = self.out_proj(x_row)
         res, skip = paddle.chunk(x_row, 2, axis=1)
         res = x_row_in + res

@@ -290,20 +318,23 @@ class ResidualNet(nn.LayerList):
     ValueError
         If the length of dilations_h does not equals n_layers.
     """
+
     def __init__(self,
                  n_layer: int,
                  residual_channels: int,
                  condition_channels: int,
                  kernel_size: Tuple[int],
                  dilations_h: List[int]):
         if len(dilations_h) != n_layer:
-            raise ValueError("number of dilations_h should equals num of layers")
+            raise ValueError(
+                "number of dilations_h should equals num of layers")
         super(ResidualNet, self).__init__()
         for i in range(n_layer):
-            dilation = (dilations_h[i], 2 ** i)
-            layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation)
+            dilation = (dilations_h[i], 2**i)
+            layer = ResidualBlock(residual_channels, condition_channels,
+                                  kernel_size, dilation)
             self.append(layer)

     def forward(self, x, condition):
         """Comput the output of given the input and the condition.

@@ -332,7 +363,7 @@ class ResidualNet(nn.LayerList):
         """
         for layer in self:
             layer.start_sequence()

     def add_input(self, x_row, condition_row):
         """Compute the output for a row and update the buffers.

@@ -386,33 +417,37 @@ class Flow(nn.Layer):
         Number of timesteps to the folded into a group.
     """
     dilations_dict = {
         8: [1, 1, 1, 1, 1, 1, 1, 1],
         16: [1, 1, 1, 1, 1, 1, 1, 1],
         32: [1, 2, 4, 1, 2, 4, 1, 2],
         64: [1, 2, 4, 8, 16, 1, 2, 4],
         128: [1, 2, 4, 8, 16, 32, 64, 1]
     }

     def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group):
         super(Flow, self).__init__()
         # input projection
         self.input_proj = nn.utils.weight_norm(
-            nn.Conv2D(1, channels, (1, 1),
-                      weight_attr=I.Uniform(-1., 1.),
-                      bias_attr=I.Uniform(-1., 1.)))
+            nn.Conv2D(
+                1,
+                channels, (1, 1),
+                weight_attr=I.Uniform(-1., 1.),
+                bias_attr=I.Uniform(-1., 1.)))

         # residual net
         self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
                                   self.dilations_dict[n_group])

         # output projection
-        self.output_proj = nn.Conv2D(channels, 2, (1, 1),
-                                     weight_attr=I.Constant(0.),
-                                     bias_attr=I.Constant(0.))
+        self.output_proj = nn.Conv2D(
+            channels,
+            2, (1, 1),
+            weight_attr=I.Constant(0.),
+            bias_attr=I.Constant(0.))

         # specs
         self.n_group = n_group

     def _predict_parameters(self, x, condition):
         x = self.input_proj(x)
         x = self.resnet(x, condition)

@@ -421,11 +456,11 @@ class Flow(nn.Layer):
         return logs, b

     def _transform(self, x, logs, b):
         z_0 = x[:, :, :1, :]  # the first row, just copy it
         z_out = x[:, :, 1:, :] * paddle.exp(logs) + b
         z_out = paddle.concat([z_0, z_out], axis=2)
         return z_out

     def forward(self, x, condition):
         """Probability density estimation. It is done by inversely transform
         a sample from p(X) into a sample from p(Z).

@@ -452,8 +487,8 @@ class Flow(nn.Layer):
         transformation from x to z.
         """
         # (B, C, H-1, W)
-        logs, b = self._predict_parameters(
-            x[:, :, :-1, :], condition[:, :, 1:, :])
+        logs, b = self._predict_parameters(x[:, :, :-1, :],
+                                           condition[:, :, 1:, :])
         z = self._transform(x, logs, b)
         return z, (logs, b)

@@ -467,7 +502,7 @@ class Flow(nn.Layer):
     def _inverse_transform_row(self, z_row, logs, b):
         x_row = (z_row - b) * paddle.exp(-logs)
         return x_row

     def _inverse_row(self, z_row, x_row, condition_row):
         logs, b = self._predict_row_parameters(x_row, condition_row)
         x_next_row = self._inverse_transform_row(z_row, logs, b)

@@ -475,7 +510,7 @@ class Flow(nn.Layer):

     def _start_sequence(self):
         self.resnet.start_sequence()

     def inverse(self, z, condition):
         """Sampling from the the distrition p(X). It is done by sample form
         p(Z) and transform the sample. It is a auto regressive transformation.

@@ -510,15 +545,16 @@ class Flow(nn.Layer):

         self._start_sequence()
         for i in range(1, self.n_group):
             x_row = x[-1]  # actuallt i-1:i
-            z_row = z[:, :, i:i+1, :]
-            condition_row = condition[:, :, i:i+1, :]
+            z_row = z[:, :, i:i + 1, :]
+            condition_row = condition[:, :, i:i + 1, :]

-            x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row)
+            x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
+                                                      condition_row)
             x.append(x_next_row)
             logs_list.append(logs)
             b_list.append(b)

         x = paddle.concat(x, 2)
         logs = paddle.concat(logs_list, 2)
         b = paddle.concat(b_list, 2)
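Note: each Flow applies a row-wise affine transform, z = x * exp(logs) + b, during density estimation, and undoes it row by row with x = (z - b) * exp(-logs) during sampling. A quick numpy check of that round trip (shapes are illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 1, 7, 16))          # rows 1: of a folded batch
    logs = rng.standard_normal((2, 1, 7, 16)) * 0.1
    b = rng.standard_normal((2, 1, 7, 16))

    z = x * np.exp(logs) + b                        # forward (density estimation)
    x_rec = (z - b) * np.exp(-logs)                 # inverse (sampling)
    assert np.allclose(x, x_rec)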
@@ -549,21 +585,25 @@ class WaveFlow(nn.LayerList):
     kernel_size : Union[int, List[int]]
         Kernel size of the convolution layer in each ResidualBlock.
     """
-    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
+
+    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
+                 kernel_size):
         if n_group % 2 or n_flows % 2:
-            raise ValueError("number of flows and number of group must be even "
-                             "since a permutation along group among flows is used.")
+            raise ValueError(
+                "number of flows and number of group must be even "
+                "since a permutation along group among flows is used.")
         super(WaveFlow, self).__init__()
         for _ in range(n_flows):
-            self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group))
+            self.append(
+                Flow(n_layers, channels, mel_bands, kernel_size, n_group))

         # permutations in h
         self.perms = self._create_perm(n_group, n_flows)

         # specs
         self.n_group = n_group
         self.n_flows = n_flows

     def _create_perm(self, n_group, n_flows):
         indices = list(range(n_group))
         half = n_group // 2

@@ -572,20 +612,21 @@ class WaveFlow(nn.LayerList):
             if i < n_flows // 2:
                 perms.append(indices[::-1])
             else:
-                perm = list(reversed(indices[:half])) + list(reversed(indices[half:]))
+                perm = list(reversed(indices[:half])) + list(
+                    reversed(indices[half:]))
                 perms.append(perm)
         return perms

     def _trim(self, x, condition):
         assert condition.shape[-1] >= x.shape[-1]
         pruned_len = int(x.shape[-1] // self.n_group * self.n_group)

         if x.shape[-1] > pruned_len:
             x = x[:, :pruned_len]
         if condition.shape[-1] > pruned_len:
             condition = condition[:, :, :pruned_len]
         return x, condition

     def forward(self, x, condition):
         """Probability density estimation of random variable x given the
         condition.
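Note: _create_perm alternates between a full reversal of the group axis (first half of the flows) and a reversal of each half separately (second half), so every row position gets mixed across flows. A sketch of the resulting pattern, mirroring the method body:

    def create_perm(n_group, n_flows):
        indices = list(range(n_group))
        half = n_group // 2
        perms = []
        for i in range(n_flows):
            if i < n_flows // 2:
                perms.append(indices[::-1])                   # full reversal
            else:
                perms.append(list(reversed(indices[:half])) +
                             list(reversed(indices[half:])))  # reverse halves
        return perms

    print(create_perm(8, 4))
    # [[7, 6, 5, 4, 3, 2, 1, 0], [7, 6, 5, 4, 3, 2, 1, 0],
    #  [3, 2, 1, 0, 7, 6, 5, 4], [3, 2, 1, 0, 7, 6, 5, 4]]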
@@ -610,21 +651,23 @@ class WaveFlow(nn.LayerList):
         # x: (B, T)
         # condition: (B, C, T) upsampled condition
         x, condition = self._trim(x, condition)

         # to (B, C, h, T//h) layout
-        x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        x = paddle.unsqueeze(
+            paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])

         # flows
         logs_list = []
         for i, layer in enumerate(self):
             x, (logs, b) = layer(x, condition)
             logs_list.append(logs)
             # permute paddle has no shuffle dim
             x = geo.shuffle_dim(x, 2, perm=self.perms[i])
             condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])

         z = paddle.squeeze(x, 1)  # (B, H, W)
         batch_size = z.shape[0]
         z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [batch_size, -1])
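Note: the fold step regroups a waveform of length T into n_group rows before the 2D flows, and the final transpose/reshape undoes that layout. A numpy sketch of the round trip (fold itself lives elsewhere in parakeet; this reimplements only the reshape, as an illustration):

    import numpy as np

    def fold(x, n_group):
        # (B, T) -> (B, T // n_group, n_group), assumed layout of parakeet's fold
        batch, time = x.shape
        return x.reshape(batch, time // n_group, n_group)

    x = np.arange(2 * 32, dtype=np.float32).reshape(2, 32)
    h = fold(x, 8).transpose([0, 2, 1])              # (B, n_group, T // n_group)
    x_back = h.transpose([0, 2, 1]).reshape(2, -1)   # inverse of the layout change
    assert np.array_equal(x, x_back)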
@@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList):

         z, condition = self._trim(z, condition)
         # to (B, C, h, T//h) layout
-        z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        z = paddle.unsqueeze(
+            paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])

         # reverse it flow by flow
         for i in reversed(range(self.n_flows)):

@@ -663,7 +708,7 @@ class WaveFlow(nn.LayerList):
             condition = geo.shuffle_dim(condition, 2, perm=self.perms[i])
             z, (logs, b) = self[i].inverse(z, condition)

         x = paddle.squeeze(z, 1)  # (B, H, W)
         batch_size = x.shape[0]
         x = paddle.reshape(paddle.transpose(x, [0, 2, 1]), [batch_size, -1])
         return x

@@ -695,23 +740,24 @@ class ConditionalWaveFlow(nn.LayerList):
     kernel_size : Union[int, List[int]]
         Kernel size of the convolution layer in each ResidualBlock.
     """
+
     def __init__(self,
                  upsample_factors: List[int],
                  n_flows: int,
                  n_layers: int,
                  n_group: int,
                  channels: int,
                  n_mels: int,
                  kernel_size: Union[int, List[int]]):
         super(ConditionalWaveFlow, self).__init__()
         self.encoder = UpsampleNet(upsample_factors)
         self.decoder = WaveFlow(
             n_flows=n_flows,
             n_layers=n_layers,
             n_group=n_group,
             channels=channels,
             mel_bands=n_mels,
             kernel_size=kernel_size)

     def forward(self, audio, mel):
         """Compute the transformed random variable z (x to z) and the log of

@@ -737,7 +783,7 @@ class ConditionalWaveFlow(nn.LayerList):
         condition = self.encoder(mel)
         z, log_det_jacobian = self.decoder(audio, condition)
         return z, log_det_jacobian

     @paddle.no_grad()
     def infer(self, mel):
         r"""Generate raw audio given mel spectrogram.

@@ -752,12 +798,12 @@ class ConditionalWaveFlow(nn.LayerList):
         Tensor : [shape=(B, T)]
             The synthesized audio, where``T <= T_mel \* upsample_factors``.
         """
         condition = self.encoder(mel, trim_conv_artifact=True)  #(B, C, T)
         batch_size, _, time_steps = condition.shape
         z = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
         x = self.decoder.inverse(z, condition)
         return x

     @paddle.no_grad()
     def predict(self, mel):
         """Generate raw audio given mel spectrogram.

@@ -777,7 +823,7 @@ class ConditionalWaveFlow(nn.LayerList):
         audio = self.infer(mel)
         audio = audio[0].numpy()
         return audio

     @classmethod
     def from_pretrained(cls, config, checkpoint_path):
         """Build a ConditionalWaveFlow model from a pretrained model.

@@ -795,14 +841,13 @@ class ConditionalWaveFlow(nn.LayerList):
         ConditionalWaveFlow
             The model built from pretrained result.
         """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
-            n_flows=config.model.n_flows,
-            n_layers=config.model.n_layers,
-            n_group=config.model.n_group,
-            channels=config.model.channels,
-            n_mels=config.data.n_mels,
-            kernel_size=config.model.kernel_size)
+        model = cls(upsample_factors=config.model.upsample_factors,
+                    n_flows=config.model.n_flows,
+                    n_layers=config.model.n_layers,
+                    n_group=config.model.n_group,
+                    channels=config.model.channels,
+                    n_mels=config.data.n_mels,
+                    kernel_size=config.model.kernel_size)
         checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
         return model

@@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer):
         The standard deviation of the gaussian noise used in WaveFlow, by
         default 1.0.
     """
+
     def __init__(self, sigma=1.0):
         super(WaveFlowLoss, self).__init__()
         self.sigma = sigma

@@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer):
         Tensor [shape=(1,)]
             The loss.
         """
-        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
+        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
+                                    ) - log_det_jacobian
         loss = loss / np.prod(z.shape)
         return loss + self.const
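Note: the loss is the per-element negative log-likelihood of z under an isotropic Gaussian with standard deviation sigma, minus the accumulated log-determinant of the flow Jacobian; self.const is assumed to hold the 0.5*log(2*pi) + log(sigma) normalizer. A numpy rendering of the same arithmetic, under that assumption:

    import math
    import numpy as np

    sigma = 1.0
    const = 0.5 * math.log(2 * math.pi) + math.log(sigma)  # assumed definition of self.const

    z = np.random.randn(4, 8000)
    log_det_jacobian = -123.4  # whatever the flows accumulated

    loss = (z * z).sum() / (2 * sigma * sigma) - log_det_jacobian
    loss = loss / z.size + const  # mean negative log-likelihood per sample point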
@@ -18,7 +18,7 @@ from typing import Union, Sequence, List
 from tqdm import trange
 import numpy as np

 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 import paddle.fluid.initializer as I

@@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools

 __all__ = ["WaveNet", "ConditionalWaveNet"]

+
 def crop(x, audio_start, audio_length):
     """Crop the upsampled condition to match audio_length.

@@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList):
     ---------
     ``librosa.core.stft``
     """
+
     def __init__(self, upscale_factors=[16, 16]):
         super(UpsampleNet, self).__init__()
         self.upscale_factors = list(upscale_factors)

@@ -106,9 +108,11 @@ class UpsampleNet(nn.LayerList):
         for factor in self.upscale_factors:
             self.append(
                 nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1,
-                                       kernel_size=(3, 2 * factor),
-                                       stride=(1, factor),
+                    nn.Conv2DTranspose(
+                        1,
+                        1,
+                        kernel_size=(3, 2 * factor),
+                        stride=(1, factor),
                         padding=(1, factor // 2))))

     def forward(self, x):

@@ -159,29 +163,34 @@ class ResidualBlock(nn.Layer):
     dilation :int
         Dilation of the internal convolution cells.
     """
+
     def __init__(self,
                  residual_channels: int,
                  condition_dim: int,
                  filter_size: Union[int, Sequence[int]],
                  dilation: int):

         super(ResidualBlock, self).__init__()
         dilated_channels = 2 * residual_channels
         # following clarinet's implementation, we do not have parametric residual
         # & skip connection.

-        _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size
+        _filter_size = filter_size[0] if isinstance(filter_size, (
+            list, tuple)) else filter_size
         std = math.sqrt(1 / (_filter_size * residual_channels))
-        conv = Conv1dCell(residual_channels,
-                          dilated_channels,
-                          filter_size,
-                          dilation=dilation,
-                          weight_attr=I.Normal(scale=std))
+        conv = Conv1dCell(
+            residual_channels,
+            dilated_channels,
+            filter_size,
+            dilation=dilation,
+            weight_attr=I.Normal(scale=std))
         self.conv = nn.utils.weight_norm(conv)

         std = math.sqrt(1 / condition_dim)
-        condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,),
-                                    weight_attr=I.Normal(scale=std))
+        condition_proj = Conv1dCell(
+            condition_dim,
+            dilated_channels, (1, ),
+            weight_attr=I.Normal(scale=std))
         self.condition_proj = nn.utils.weight_norm(condition_proj)

         self.filter_size = filter_size

@@ -309,10 +318,11 @@ class ResidualNet(nn.LayerList):
         Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.

     """
+
     def __init__(self,
                  n_stack: int,
                  n_loop: int,
                  residual_channels: int,
                  condition_dim: int,
                  filter_size: int):
         super(ResidualNet, self).__init__()

@@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList):
         dilations = [2**i for i in range(n_loop)] * n_stack
         self.context_size = 1 + sum(dilations)
         for dilation in dilations:
-            self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
+            self.append(
+                ResidualBlock(residual_channels, condition_dim, filter_size,
+                              dilation))

     def forward(self, x, condition=None):
         """Forward pass of ``ResidualNet``.
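Note: context_size above is the receptive field of the stacked dilated convolutions: 1 plus the sum of all dilations, which matches the default filter_size=2 where each layer widens the context by exactly its dilation. Worked out for an example configuration:

    n_stack, n_loop = 2, 10
    dilations = [2**i for i in range(n_loop)] * n_stack  # 1, 2, 4, ..., 512, repeated
    context_size = 1 + sum(dilations)
    print(context_size)  # 2047 samples of left context feed each prediction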
@@ -345,7 +357,7 @@ class ResidualNet(nn.LayerList):
                 skip_connections = skip
             else:
                 skip_connections = paddle.scale(skip_connections + skip,
                                                 math.sqrt(0.5))
         return skip_connections

     def start_sequence(self):

@@ -381,7 +393,7 @@ class ResidualNet(nn.LayerList):
                 skip_connections = skip
             else:
                 skip_connections = paddle.scale(skip_connections + skip,
                                                 math.sqrt(0.5))
         return skip_connections


@@ -426,6 +438,7 @@ class WaveNet(nn.Layer):
         This is only used for computing loss when ``loss_type`` is "mog", If
         the predicted log scale is less than -9.0, it is clipped at -9.0.
     """
+
     def __init__(self, n_stack, n_loop, residual_channels, output_dim,
                  condition_dim, filter_size, loss_type, log_scale_min):

@@ -437,19 +450,24 @@ class WaveNet(nn.Layer):
         else:
             if (output_dim % 3 != 0):
                 raise ValueError(
-                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
-        self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1)
+                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".
+                    format(output_dim))
+        self.embed = nn.utils.weight_norm(
+            nn.Linear(1, residual_channels), dim=1)

         self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
                                   condition_dim, filter_size)
         self.context_size = self.resnet.context_size

         skip_channels = residual_channels  # assume the same channel
-        self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
-        self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj1 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj2 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
         # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
         # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
-        self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1)
+        self.proj3 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, output_dim), dim=1)

         self.loss_type = loss_type
         self.output_dim = output_dim
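Note: with loss_type "mog" the final projection emits three parameters per Gaussian component, per the "weight, mean and stddev" comment above. A hedged numpy sketch of splitting such an output; the block layout and softmax normalization are illustrative, not quoted from the model:

    import numpy as np

    n_mixture = 10
    y = np.random.randn(4, 100, 3 * n_mixture)   # (batch, time, output_dim)

    w, mu, log_std = np.split(y, 3, axis=-1)     # one block per parameter kind
    w = np.exp(w - w.max(-1, keepdims=True))
    w = w / w.sum(-1, keepdims=True)             # softmax over mixture weights
    log_std = np.maximum(log_std, -9.0)          # clip log scale, as documented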
@@ -781,26 +799,28 @@ class ConditionalWaveNet(nn.Layer):
         This is only used for computing loss when ``loss_type`` is "mog", If
         the predicted log scale is less than -9.0, it is clipped at -9.0.
     """
+
     def __init__(self,
                  upsample_factors: List[int],
                  n_stack: int,
                  n_loop: int,
                  residual_channels: int,
                  output_dim: int,
                  n_mels: int,
                  filter_size: int=2,
                  loss_type: str="mog",
                  log_scale_min: float=-9.0):
         super(ConditionalWaveNet, self).__init__()
         self.encoder = UpsampleNet(upsample_factors)
-        self.decoder = WaveNet(n_stack=n_stack,
-                               n_loop=n_loop,
-                               residual_channels=residual_channels,
-                               output_dim=output_dim,
-                               condition_dim=n_mels,
-                               filter_size=filter_size,
-                               loss_type=loss_type,
-                               log_scale_min=log_scale_min)
+        self.decoder = WaveNet(
+            n_stack=n_stack,
+            n_loop=n_loop,
+            residual_channels=residual_channels,
+            output_dim=output_dim,
+            condition_dim=n_mels,
+            filter_size=filter_size,
+            loss_type=loss_type,
+            log_scale_min=log_scale_min)

     def forward(self, audio, mel, audio_start):
         """Compute the output distribution given the mel spectrogram and the input(for teacher force training).

@@ -895,11 +915,11 @@ class ConditionalWaveNet(nn.Layer):
         self.decoder.start_sequence()
         x_t = paddle.zeros((batch_size, ), dtype=mel.dtype)
         for i in trange(time_steps):
             c_t = condition[:, :, i]  # (B, C)
             y_t = self.decoder.add_input(x_t, c_t)  #(B, C)
             y_t = paddle.unsqueeze(y_t, 1)
             x_t = self.sample(y_t)  # (B, 1)
             x_t = paddle.squeeze(x_t, 1)  #(B,)
             samples.append(x_t)
         samples = paddle.stack(samples, -1)
         return samples

@@ -943,16 +963,15 @@ class ConditionalWaveNet(nn.Layer):
         ConditionalWaveNet
             The model built from pretrained result.
         """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
-            n_stack=config.model.n_stack,
-            n_loop=config.model.n_loop,
-            residual_channels=config.model.residual_channels,
-            output_dim=config.model.output_dim,
-            n_mels=config.data.n_mels,
-            filter_size=config.model.filter_size,
-            loss_type=config.model.loss_type,
-            log_scale_min=config.model.log_scale_min)
+        model = cls(upsample_factors=config.model.upsample_factors,
+                    n_stack=config.model.n_stack,
+                    n_loop=config.model.n_loop,
+                    residual_channels=config.model.residual_channels,
+                    output_dim=config.model.output_dim,
+                    n_mels=config.data.n_mels,
+                    filter_size=config.model.filter_size,
+                    loss_type=config.model.loss_type,
+                    log_scale_min=config.model.log_scale_min)
         layer_tools.summary(model)
         checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
         return model

@@ -1,8 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
 import numpy as np

 __all__ = ["quantize", "dequantize", "STFT"]


@@ -86,6 +100,7 @@ class STFT(nn.Layer):
         Ony ``center`` and ``reflect`` padding is supported now.

     """
+
     def __init__(self, n_fft, hop_length, win_length, window="hanning"):
         super(STFT, self).__init__()
         self.hop_length = hop_length

@@ -109,7 +124,8 @@ class STFT(nn.Layer):
                             (self.n_bin, 1, 1, self.n_fft))

         w = np.concatenate([w_real, w_imag], axis=0)
-        self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+        self.weight = paddle.cast(
+            paddle.to_tensor(w), paddle.get_default_dtype())

     def forward(self, x):
         """Compute the stft transform.
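Note: the STFT layer realizes the transform as a convolution whose kernels are the windowed Fourier basis; w_real and w_imag stack cosine and sine rows scaled by the analysis window, in the (n_bin, 1, 1, n_fft) shape seen above. A numpy sketch of building such a kernel (the window choice and the sign convention on the imaginary part are assumptions, not quoted from the module):

    import numpy as np
    from scipy import signal

    n_fft = 512
    n_bin = n_fft // 2 + 1
    window = signal.get_window("hann", n_fft)

    k = np.arange(n_bin).reshape(-1, 1)
    n = np.arange(n_fft).reshape(1, -1)
    w_real = (window * np.cos(2 * np.pi * k * n / n_fft)).reshape(n_bin, 1, 1, n_fft)
    w_imag = (-window * np.sin(2 * np.pi * k * n / n_fft)).reshape(n_bin, 1, 1, n_fft)
    w = np.concatenate([w_real, w_imag], axis=0)  # conv kernels: real rows, then imaginary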
@@ -20,6 +20,7 @@ __all__ = [
     "Conv1dBatchNorm",
 ]

+
 class Conv1dCell(nn.Conv1D):
     """A subclass of Conv1D layer, which can be used in an autoregressive
     decoder like an RNN cell.

@@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer):
     epsilon : [type], optional
         The epsilon of the BatchNorm1D layer, by default 1e-05
     """
+
     def __init__(self,
                  in_channels,
                  out_channels,

@@ -1,6 +1,21 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import paddle


 def shuffle_dim(x, axis, perm=None):
     """Permute input tensor along aixs given the permutation or randomly.

@@ -32,7 +47,7 @@ def shuffle_dim(x, axis, perm=None):
         perm = np.array(perm)
     else:
         perm = np.random.permutation(size)

     perm = paddle.to_tensor(perm)
     out = paddle.gather(x, perm, axis)
     return out
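Note: shuffle_dim is what WaveFlow uses for its fixed group permutations; since Paddle has no dedicated shuffle-along-axis op, gather with an index tensor does the job. A small usage sketch (requires paddle, and assumes the module lives at parakeet.modules.geometry; the values are illustrative):

    import paddle
    from parakeet.modules.geometry import shuffle_dim

    x = paddle.arange(12, dtype="float32").reshape([3, 4])
    y = shuffle_dim(x, axis=1, perm=[3, 2, 1, 0])  # reverse the second axis
    # y[0] == [3., 2., 1., 0.]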
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numba
 import numpy as np
 import paddle

@@ -5,12 +19,13 @@ from paddle import nn
 from paddle.nn import functional as F

 __all__ = [
     "weighted_mean",
     "masked_l1_loss",
     "masked_softmax_with_cross_entropy",
     "diagonal_loss",
 ]

+
 def weighted_mean(input, weight):
     """Weighted mean. It can also be used as masked mean.

@@ -88,12 +103,11 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
     return loss


-def diagonal_loss(
-        attentions,
-        input_lengths,
-        target_lengths,
-        g=0.2,
-        multihead=False):
+def diagonal_loss(attentions,
+                  input_lengths,
+                  target_lengths,
+                  g=0.2,
+                  multihead=False):
     """A metric to evaluate how diagonal a attention distribution is.

     It is computed for batch attention distributions. For each attention

@@ -133,6 +147,7 @@ def diagonal_loss(
     else:
         return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))

+
 @numba.jit(nopython=True)
 def guided_attention(N, max_N, T, max_T, g):
     W = np.zeros((max_T, max_N), dtype=np.float32)

@@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g):
     # (T_dec, T_enc)
     return W

+
 def guided_attentions(input_lengths, target_lengths, g=0.2):
     B = len(input_lengths)
     max_input_len = input_lengths.max()

@@ -151,4 +167,4 @@ def guided_attentions(input_lengths, target_lengths, g=0.2):
         W[b] = guided_attention(input_lengths[b], max_input_len,
                                 target_lengths[b], max_target_len, g)
     # (B, T_dec, T_enc)
     return W
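Note: guided_attention builds the soft diagonal prior from Tacotron-style guided attention, conventionally W[t, n] = 1 - exp(-((n/N - t/T)^2) / (2 g^2)); the weight is near zero on the stretched diagonal and grows off it. A numpy sketch of one such matrix (the exact formula inside the numba kernel is assumed from the guided-attention literature, not quoted):

    import numpy as np

    def guided_attention_weight(N, T, g=0.2):
        n = np.arange(N).reshape(1, -1) / N   # encoder positions
        t = np.arange(T).reshape(-1, 1) / T   # decoder positions
        return 1.0 - np.exp(-((n - t) ** 2) / (2 * g * g))  # (T_dec, T_enc)

    W = guided_attention_weight(64, 64)
    print(W[0, 0], W[0, -1])  # ~0 on the diagonal, close to 1 far away from it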
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle
 from paddle.fluid.layers import sequence_mask

@@ -8,6 +22,7 @@ __all__ = [
     "future_mask",
 ]

+
 def id_mask(input, padding_index=0, dtype="bool"):
     """Generate mask with input ids.
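Note: the masks exported here follow the usual conventions — id_mask marks non-padding positions, and future_mask is the lower-triangular matrix that blocks a decoder from attending to future steps. A numpy sketch of the triangular pattern (illustrative, not the Paddle implementation):

    import numpy as np

    def future_mask_np(time_steps):
        # 1 where position j may be attended from position i (j <= i)
        return np.tril(np.ones((time_steps, time_steps), dtype=bool))

    print(future_mask_np(4).astype(int))
    # [[1 0 0 0]
    #  [1 1 0 0]
    #  [1 1 1 0]
    #  [1 1 1 1]]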
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import numpy as np
 import paddle

@@ -5,6 +19,7 @@ from paddle.nn import functional as F

 __all__ = ["positional_encoding"]

+
 def positional_encoding(start_index, length, size, dtype=None):
     r"""Generate standard positional encoding matrix.

@@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None):
     dtype = dtype or paddle.get_default_dtype()
     channel = np.arange(0, size, 2)
     index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
+    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
     encodings = np.zeros([length, size])
     encodings[:, 0::2] = np.sin(p)
     encodings[:, 1::2] = np.cos(p)
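Note: this is the sinusoidal encoding from "Attention is All You Need": PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). A standalone numpy version mirroring the lines above:

    import numpy as np

    def sinusoid_position_encoding(start_index, length, size):
        channel = np.arange(0, size, 2)
        index = np.arange(start_index, start_index + length, 1)
        p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
        encodings = np.zeros([length, size])
        encodings[:, 0::2] = np.sin(p)
        encodings[:, 1::2] = np.cos(p)
        return encodings  # (length, size)

    pe = sinusoid_position_encoding(0, 100, 64)
    assert pe.shape == (100, 64) and abs(pe[0, 1] - 1.0) < 1e-12  # cos(0) == 1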
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 import paddle
 from paddle import nn

@@ -12,6 +26,7 @@ __all__ = [
     "TransformerDecoderLayer",
 ]

+
 class PositionwiseFFN(nn.Layer):
     """A faithful implementation of Position-wise Feed-Forward Network
     in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

@@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer):
         The probability of the Dropout applied to the output of the first
         layer, by default 0.
     """
-    def __init__(self,
-                 input_size: int,
-                 hidden_size: int,
-                 dropout=0.0):
+
+    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
         super(PositionwiseFFN, self).__init__()
         self.linear1 = nn.Linear(input_size, hidden_size)
         self.linear2 = nn.Linear(hidden_size, input_size)

@@ -86,16 +99,17 @@ class TransformerEncoderLayer(nn.Layer):
     ------
     It uses the PostLN (post layer norm) scheme.
     """
+
     def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
         super(TransformerEncoderLayer, self).__init__()
         self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
         self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

         self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
         self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

         self.dropout = dropout

     def forward(self, x, mask):
         """Forward pass of TransformerEncoderLayer.

@@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer):
         """
         context_vector, attn_weights = self.self_mha(x, x, x, mask)
         x = self.layer_norm1(
-            F.dropout(x + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + context_vector, self.dropout, training=self.training))

         x = self.layer_norm2(
-            F.dropout(x + self.ffn(x),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + self.ffn(x), self.dropout, training=self.training))
         return x, attn_weights


@@ -155,19 +167,20 @@ class TransformerDecoderLayer(nn.Layer):
     ------
     It uses the PostLN (post layer norm) scheme.
     """
+
     def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
         super(TransformerDecoderLayer, self).__init__()
         self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
         self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

         self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
         self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

         self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
         self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)

         self.dropout = dropout

     def forward(self, q, k, v, encoder_mask, decoder_mask):
         """Forward pass of TransformerEncoderLayer.

@@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer):
         cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
             Decoder-encoder cross attention.
         """
-        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
+        context_vector, self_attn_weights = self.self_mha(q, q, q,
+                                                          decoder_mask)
         q = self.layer_norm1(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
-
-        context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))
+
+        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
+                                                            encoder_mask)
         q = self.layer_norm2(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))

         q = self.layer_norm3(
-            F.dropout(q + self.ffn(q),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + self.ffn(q), self.dropout, training=self.training))
         return q, self_attn_weights, cross_attn_weights
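Note: both layers follow the PostLN residual scheme named in their docstrings — the sublayer output is added to its input, dropout is applied to the sum, and the result is layer-normalized. The recurring pattern written out once, as a self-contained numpy sketch (dropout folded into a mask for simplicity):

    import numpy as np

    def layer_norm(x, eps=1e-6):
        mu = x.mean(-1, keepdims=True)
        var = x.var(-1, keepdims=True)
        return (x - mu) / np.sqrt(var + eps)

    def post_ln(x, sublayer, dropout_mask=1.0):
        # PostLN: normalize after the residual connection
        return layer_norm((x + sublayer(x)) * dropout_mask)

    x = np.random.randn(2, 5, 8)
    y = post_ln(x, lambda t: t * 0.5)  # stand-in for attention or the FFN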
@@ -1,2 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.training.cli import *
 from parakeet.training.experiment import *

@@ -1,5 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse


 def default_argument_parser():
     r"""A simple yet genral argument parser for experiments with parakeet.

@@ -46,5 +61,5 @@ def default_argument_parser():
     # overwrite extra config and default config
     parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
     # yapd: enable

     return parser

@@ -1,12 +1,26 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode

 _C = CfgNode(
     dict(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=900000,  # max iteration to train
-    )
-)
+    ))


 def get_default_training_config():
     return _C.clone()
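Note: yacs CfgNode gives a dot-accessible config tree; clone() hands each experiment its own copy, and merge_from_list is how KEY VALUE pairs such as those collected by the --opts argument above are typically applied. A usage sketch (the override key is illustrative):

    from yacs.config import CfgNode

    _C = CfgNode(dict(valid_interval=1000, save_interval=10000, max_iteration=900000))

    config = _C.clone()                                   # per-experiment copy
    config.merge_from_list(["max_iteration", "500000"])   # e.g. parsed from --opts
    print(config.max_iteration)                           # 500000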
@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools
|
||||||
|
|
||||||
__all__ = ["ExperimentBase"]
|
__all__ = ["ExperimentBase"]
|
||||||
|
|
||||||
|
|
||||||
class ExperimentBase(object):
|
class ExperimentBase(object):
|
||||||
"""
|
"""
|
||||||
An experiment template in order to structure the training code and take
|
An experiment template in order to structure the training code and take
|
||||||
|
|
|
@@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
     return iteration


 def _save_checkpoint(checkpoint_dir: str, iteration: int):
     """Save the iteration number of the latest model to be checkpointed.

@@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
     with open(checkpoint_record, "wt") as handle:
         handle.write("model_checkpoint_path: step-{}".format(iteration))


 def load_parameters(model,
                     optimizer=None,
                     checkpoint_dir=None,
@@ -97,18 +99,19 @@ def load_parameters(model,
     params_path = checkpoint_path + ".pdparams"
     model_dict = paddle.load(params_path)
     model.set_state_dict(model_dict)
-    print("[checkpoint] Rank {}: loaded model from {}".format(
-        local_rank, params_path))
+    print("[checkpoint] Rank {}: loaded model from {}".format(local_rank,
+                                                              params_path))

     optimizer_path = checkpoint_path + ".pdopt"
     if optimizer and os.path.isfile(optimizer_path):
         optimizer_dict = paddle.load(optimizer_path)
         optimizer.set_state_dict(optimizer_dict)
-        print("[checkpoint] Rank {}: loaded optimizer state from {}".
-              format(local_rank, optimizer_path))
+        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
+            local_rank, optimizer_path))

     return iteration


 @mp_tools.rank_zero_only
 def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
     """Checkpoint the latest trained model parameters.
@@ -124,7 +127,7 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
         None
     """
     checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))

     model_dict = model.state_dict()
     params_path = checkpoint_path + ".pdparams"
     paddle.save(model_dict, params_path)
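Editor's note: a minimal sketch of the save/load round trip these helpers wrap, using only Paddle's public `paddle.save`/`paddle.load` API; the model, optimizer, and directory are illustrative, not Parakeet's own objects:

```python
import os
import paddle
from paddle import nn

model = nn.Linear(4, 4)  # illustrative model
opt = paddle.optimizer.Adam(parameters=model.parameters())

os.makedirs("checkpoints", exist_ok=True)
checkpoint_path = os.path.join("checkpoints", "step-100")

# save: parameters and optimizer state go to separate files,
# mirroring the ".pdparams" / ".pdopt" convention above
paddle.save(model.state_dict(), checkpoint_path + ".pdparams")
paddle.save(opt.state_dict(), checkpoint_path + ".pdopt")

# load: restore both state dicts
model.set_state_dict(paddle.load(checkpoint_path + ".pdparams"))
opt.set_state_dict(paddle.load(checkpoint_path + ".pdopt"))
```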
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from paddle.framework import core

@@ -28,6 +28,7 @@ def summary(layer: nn.Layer):
     print("layer has {} parameters, {} elements.".format(num_params,
                                                          num_elements))


 def gradient_norm(layer: nn.Layer):
     grad_norm_dict = {}
     for name, param in layer.state_dict().items():
@@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer):
             grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
     return grad_norm_dict


 def recursively_remove_weight_norm(layer: nn.Layer):
     for layer in layer.sublayers():
         try:
@@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer):
             # there is no weight norm hook in this layer
             pass


 def freeze(layer: nn.Layer):
     for param in layer.parameters():
         param.trainable = False


 def unfreeze(layer: nn.Layer):
     for param in layer.parameters():
         param.trainable = True
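Editor's note: a runnable sketch of what `freeze`/`unfreeze` amount to (the layer is illustrative). Setting `trainable = False` excludes a parameter from gradient updates, e.g. when fine-tuning only part of a network:

```python
import paddle
from paddle import nn

layer = nn.Linear(4, 4)  # illustrative layer

# freeze: stop the optimizer from updating these parameters
for param in layer.parameters():
    param.trainable = False

assert all(not p.trainable for p in layer.parameters())

# unfreeze: make them trainable again
for param in layer.parameters():
    param.trainable = True
```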
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle
 from paddle import distributed as dist
 from functools import wraps
@@ -11,11 +25,8 @@ def rank_zero_only(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
         if local_rank != 0:
             return
         result = func(*args, **kwargs)
         return result

     return wrapper
-
-
-
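Editor's note: a self-contained sketch of the `rank_zero_only` idea, so side effects such as logging or checkpointing run once in multi-process training. The decorated function is illustrative; note the rank is read when the decorator is applied:

```python
from functools import wraps

import paddle.distributed as dist


def rank_zero_only(func):
    local_rank = dist.get_rank()  # 0 in single-process runs

    @wraps(func)
    def wrapper(*args, **kwargs):
        if local_rank != 0:
            return  # silently skip on non-zero ranks
        return func(*args, **kwargs)

    return wrapper


@rank_zero_only
def log(msg):
    print(msg)

log("only printed by process 0")
```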
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math

 __all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"]
@@ -24,7 +38,7 @@ class PieceWise(SchedulerBase):
         self.xs = [item[0] for item in anchors]
         self.ys = [item[1] for item in anchors]
         self.num_anchors = len(self.xs)

     def __call__(self, step):
         i = 0
         for x in self.xs:
@@ -34,8 +48,8 @@ class PieceWise(SchedulerBase):
             return self.ys[0]
         if i == self.num_anchors:
             return self.ys[-1]
-        k = (self.ys[i] - self.ys[i-1]) / (self.xs[i] - self.xs[i-1])
-        out = self.ys[i-1] + (step - self.xs[i-1]) * k
+        k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1])
+        out = self.ys[i - 1] + (step - self.xs[i - 1]) * k
         return out


@@ -47,7 +61,7 @@ class StepWise(SchedulerBase):
         self.xs = [item[0] for item in anchors]
         self.ys = [item[1] for item in anchors]
         self.num_anchors = len(self.xs)

     def __call__(self, step):
         i = 0
         for x in self.xs:
@@ -58,5 +72,4 @@ class StepWise(SchedulerBase):
             return self.ys[-1]
         if i == 0:
             return self.ys[0]
-        return self.ys[i-1]
-
+        return self.ys[i - 1]
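Editor's note: a compact, runnable sketch of the `PieceWise` schedule above, assuming the anchors are sorted `(step, value)` pairs; the warmup-then-decay anchors are illustrative. It interpolates linearly between anchors and clamps at both ends:

```python
import bisect

def piecewise(anchors, step):
    """Linear interpolation between sorted (step, value) anchors."""
    xs = [x for x, _ in anchors]
    ys = [y for _, y in anchors]
    i = bisect.bisect_right(xs, step)  # number of anchors <= step
    if i == 0:
        return ys[0]                   # before the first anchor
    if i == len(xs):
        return ys[-1]                  # past the last anchor
    k = (ys[i] - ys[i - 1]) / (xs[i] - xs[i - 1])
    return ys[i - 1] + (step - xs[i - 1]) * k

anchors = [(0, 0.0), (1000, 1e-3), (10000, 1e-4)]  # warmup, then decay
print(piecewise(anchors, 500))  # 0.0005, halfway through warmup
```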
setup.py (16 changed lines)
@@ -48,7 +48,6 @@ setup_info = dict(
     description='Speech synthesis tools and models based on Paddlepaddle',
     long_description=long_description,
     license='Apache 2',
-
     python_requires='>=3.6',
     install_requires=[
         'numpy',
@@ -64,30 +63,25 @@ setup_info = dict(
         'scipy',
         'pandas',
         'sox',
-        'opencc',
+        # 'opencc',
         'soundfile',
         'g2p_en',
         'g2pM',
         'yacs',
         'tensorboardX',
     ],
-    extras_require={
-        'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
-    },
+    extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },

     # Package info
     packages=find_packages(exclude=('tests', 'tests.*')),
     zip_safe=True,
-    classifiers = [
+    classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'License :: OSI Approved :: Apache2 License',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
-    ],
-
-)
+    ], )

 setup(**setup_info)
@@ -1,101 +0,0 @@
-import unittest
-import numpy as np
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.modules import attention as attn
-
-class TestScaledDotProductAttention(unittest.TestCase):
-    def test_without_mask(self):
-        x = paddle.randn([4, 16, 8])
-        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
-        assert(list(context_vector.shape) == [4, 16, 8])
-        assert(list(attention_weights.shape) == [4, 16, 16])
-
-    def test_with_mask(self):
-        x = paddle.randn([4, 16, 8])
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([16, 15, 13, 14]), dtype=x.dtype)
-        mask = mask.unsqueeze(1)  # unsqueeze for the decoder time steps
-        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x, mask)
-        assert(list(context_vector.shape) == [4, 16, 8])
-        assert(list(attention_weights.shape) == [4, 16, 16])
-
-    def test_4d(self):
-        x = paddle.randn([4, 6, 16, 8])
-        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
-        assert(list(context_vector.shape) == [4, 6, 16, 8])
-        assert(list(attention_weights.shape) == [4, 6, 16, 16])
-
-
-class TestMonoheadAttention(unittest.TestCase):
-    def test_io(self):
-        net = attn.MonoheadAttention(6, 0.1)
-        q = paddle.randn([4, 18, 6])
-        k = paddle.randn([4, 12, 6])
-        v = paddle.randn([4, 12, 6])
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
-        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
-        context_vector, attn_weights = net(q, k, v, mask)
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 18, 6))
-        self.assertTupleEqual(attn_weights.numpy().shape, (4, 18, 12))
-
-
-class TestDropHead(unittest.TestCase):
-    def test_drop(self):
-        x = paddle.randn([4, 6, 16, 8])
-        out = attn.drop_head(x, 2, training=True)
-        # drop 2 heads from 6 at all positions
-        np.testing.assert_allclose(np.sum(out.numpy() == 0., axis=1), 2)
-
-    def test_drop_all(self):
-        x = paddle.randn([4, 6, 16, 8])
-        out = attn.drop_head(x, 6, training=True)
-        np.testing.assert_allclose(np.sum(out.numpy()), 0)
-
-    def test_eval(self):
-        x = paddle.randn([4, 6, 16, 8])
-        out = attn.drop_head(x, 6, training=False)
-        self.assertIs(x, out)
-
-
-class TestMultiheadAttention(unittest.TestCase):
-    def __init__(self, methodName="test_io", same_qk=True):
-        super(TestMultiheadAttention, self).__init__(methodName)
-        self.same_qk = same_qk
-
-    def setUp(self):
-        if self.same_qk:
-            net = attn.MultiheadAttention(64, 8, dropout=0.3)
-        else:
-            net = attn.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
-        self.net = net
-
-    def test_io(self):
-        q = paddle.randn([4, 12, 64])
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
-        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
-        context_vector, attention_weights = self.net(q, q, q, mask)
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
-        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
-
-
-def load_tests(loader, standard_tests, pattern):
-    suite = unittest.TestSuite()
-    suite.addTest(TestScaledDotProductAttention("test_without_mask"))
-    suite.addTest(TestScaledDotProductAttention("test_with_mask"))
-    suite.addTest(TestScaledDotProductAttention("test_4d"))
-
-    suite.addTest(TestDropHead("test_drop"))
-    suite.addTest(TestDropHead("test_drop_all"))
-    suite.addTest(TestDropHead("test_eval"))
-
-    suite.addTest(TestMonoheadAttention("test_io"))
-
-    suite.addTest(TestMultiheadAttention("test_io", same_qk=True))
-    suite.addTest(TestMultiheadAttention("test_io", same_qk=False))
-
-    return suite
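Editor's note: for reference, the scaled dot-product attention exercised by the deleted tests above is the standard formulation. With queries of shape `(batch, T_q, d)` and keys/values of shape `(batch, T_kv, d)`, the attention weights have shape `(batch, T_q, T_kv)`, matching the `(4, 16, 16)` assertions:

```latex
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V
```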
@@ -1,34 +0,0 @@
-import unittest
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-from parakeet.modules import cbhg
-
-
-class TestHighway(unittest.TestCase):
-    def test_io(self):
-        net = cbhg.Highway(4)
-        x = paddle.randn([2, 12, 4])
-        y = net(x)
-        self.assertTupleEqual(y.numpy().shape, (2, 12, 4))
-
-
-class TestCBHG(unittest.TestCase):
-    def __init__(self, methodName="runTest", ):
-        super(TestCBHG, self).__init__(methodName)
-
-    def test_io(self):
-        self.net = cbhg.CBHG(64, 32, 16,
-                             projection_channels=[64, 128],
-                             num_highways=4, highway_features=128,
-                             gru_features=64)
-        x = paddle.randn([4, 64, 32])
-        y = self.net(x)
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 128))
-
-
-def load_tests(loader, standard_tests, pattern):
-    suite = unittest.TestSuite()
-
-    suite.addTest(TestHighway("test_io"))
-    suite.addTest(TestCBHG("test_io"))
-    return suite
@@ -1,43 +0,0 @@
-import unittest
-import numpy as np
-
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.models import clarinet
-from parakeet.modules import stft
-
-class TestParallelWaveNet(unittest.TestCase):
-    def test_io(self):
-        net = clarinet.ParallelWaveNet([8, 8, 8], [1, 1, 1], 16, 12, 2)
-        x = paddle.randn([4, 6073])
-        condition = paddle.randn([4, 12, 6073])
-        z, out_mu, out_log_std = net(x, condition)
-        self.assertTupleEqual(z.numpy().shape, (4, 6073))
-        self.assertTupleEqual(out_mu.numpy().shape, (4, 6073))
-        self.assertTupleEqual(out_log_std.numpy().shape, (4, 6073))
-
-
-class TestClariNet(unittest.TestCase):
-    def setUp(self):
-        encoder = clarinet.UpsampleNet([2, 2])
-        teacher = clarinet.WaveNet(8, 3, 16, 3, 12, 2, "mog", -9.0)
-        student = clarinet.ParallelWaveNet([8, 8, 8, 8, 8, 8], [1, 1, 1, 1, 1, 1], 16, 12, 2)
-        stft_module = stft.STFT(16, 4, 8)
-        net = clarinet.Clarinet(encoder, teacher, student, stft_module, -6.0, lmd=4)
-        print("context size is: ", teacher.context_size)
-        self.net = net
-
-    def test_io(self):
-        audio = paddle.randn([4, 1366])
-        mel = paddle.randn([4, 12, 512])  # 512 * 4 = 2048
-        audio_start = paddle.zeros([4], dtype="int64")
-        loss = self.net(audio, mel, audio_start, clip_kl=True)
-        loss["loss"].numpy()
-
-    def test_synthesis(self):
-        mel = paddle.randn([4, 12, 512])  # 64 = 246 / 4
-        out = self.net.synthesis(mel)
-        self.assertTupleEqual(out.numpy().shape, (4, 2048))
@@ -1,33 +0,0 @@
-import unittest
-import paddle
-from paddle import nn
-paddle.disable_static(paddle.CPUPlace())
-paddle.set_default_dtype("float64")
-
-from parakeet.modules import connections as conn
-
-class TestPreLayerNormWrapper(unittest.TestCase):
-    def test_io(self):
-        net = nn.Linear(8, 8)
-        net = conn.PreLayerNormWrapper(net, 8)
-        x = paddle.randn([4, 8])
-        y = net(x)
-        self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
-
-
-class TestPostLayerNormWrapper(unittest.TestCase):
-    def test_io(self):
-        net = nn.Linear(8, 8)
-        net = conn.PostLayerNormWrapper(net, 8)
-        x = paddle.randn([4, 8])
-        y = net(x)
-        self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
-
-
-class TestResidualWrapper(unittest.TestCase):
-    def test_io(self):
-        net = nn.Linear(8, 8)
-        net = conn.ResidualWrapper(net)
-        x = paddle.randn([4, 8])
-        y = net(x)
-        self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
@@ -1,67 +0,0 @@
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-import unittest
-import numpy as np
-
-from parakeet.modules import conv
-
-class TestConv1dCell(unittest.TestCase):
-    def setUp(self):
-        self.net = conv.Conv1dCell(4, 6, 5, dilation=2)
-
-    def forward_incremental(self, x):
-        outs = []
-        self.net.start_sequence()
-        with paddle.no_grad():
-            for i in range(x.shape[-1]):
-                xt = x[:, :, i]
-                yt = self.net.add_input(xt)
-                outs.append(yt)
-            y2 = paddle.stack(outs, axis=-1)
-        return y2
-
-    def test_equality(self):
-        x = paddle.randn([2, 4, 16])
-        y1 = self.net(x)
-
-        self.net.eval()
-        y2 = self.forward_incremental(x)
-
-        np.testing.assert_allclose(y2.numpy(), y1.numpy())
-
-
-class TestConv1dBatchNorm(unittest.TestCase):
-    def __init__(self, methodName="runTest", causal=False, channel_last=False):
-        super(TestConv1dBatchNorm, self).__init__(methodName)
-        self.causal = causal
-        self.channel_last = channel_last
-
-    def setUp(self):
-        k = 5
-        padding = (k - 1, 0) if self.causal else ((k - 1) // 2, k // 2)
-        self.net = conv.Conv1dBatchNorm(4, 6, (k,), 1, padding=padding,
-                                        data_format="NLC" if self.channel_last else "NCL")
-
-    def test_input_output(self):
-        x = paddle.randn([4, 16, 4]) if self.channel_last else paddle.randn([4, 4, 16])
-        out = self.net(x)
-        out_np = out.numpy()
-        if self.channel_last:
-            self.assertTupleEqual(out_np.shape, (4, 16, 6))
-        else:
-            self.assertTupleEqual(out_np.shape, (4, 6, 16))
-
-    def runTest(self):
-        self.test_input_output()
-
-
-def load_tests(loader, standard_tests, pattern):
-    suite = unittest.TestSuite()
-    suite.addTest(TestConv1dBatchNorm("runTest", True, True))
-    suite.addTest(TestConv1dBatchNorm("runTest", False, False))
-    suite.addTest(TestConv1dBatchNorm("runTest", True, False))
-    suite.addTest(TestConv1dBatchNorm("runTest", False, True))
-    suite.addTest(TestConv1dCell("test_equality"))
-
-    return suite
@@ -1,122 +0,0 @@
-import unittest
-import numpy as np
-import paddle
-from paddle import io
-from parakeet import data
-
-class MyDataset(io.Dataset):
-    def __init__(self, size):
-        self._data = np.random.randn(size, 6)
-
-    def __getitem__(self, i):
-        return self._data[i]
-
-    def __len__(self):
-        return self._data.shape[0]
-
-
-class TestTransformDataset(unittest.TestCase):
-    def test(self):
-        dataset = MyDataset(20)
-        dataset = data.TransformDataset(dataset, lambda x: np.abs(x))
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("TransformDataset")
-        for batch, in dataloader:
-            print(type(batch), batch.dtype, batch.shape)
-
-
-class TestChainDataset(unittest.TestCase):
-    def test(self):
-        dataset1 = MyDataset(20)
-        dataset2 = MyDataset(40)
-        dataset = data.ChainDataset(dataset1, dataset2)
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("ChainDataset")
-        for batch, in dataloader:
-            print(type(batch), batch.dtype, batch.shape)
-
-
-class TestTupleDataset(unittest.TestCase):
-    def test(self):
-        dataset1 = MyDataset(20)
-        dataset2 = MyDataset(20)
-        dataset = data.TupleDataset(dataset1, dataset2)
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("TupleDataset")
-        for field1, field2 in dataloader:
-            print(type(field1), field1.dtype, field1.shape)
-            print(type(field2), field2.dtype, field2.shape)
-
-
-class TestDictDataset(unittest.TestCase):
-    def test(self):
-        dataset1 = MyDataset(20)
-        dataset2 = MyDataset(20)
-        dataset = data.DictDataset(field1=dataset1, field2=dataset2)
-        def collate_fn(examples):
-            examples_tuples = []
-            for example in examples:
-                examples_tuples.append(example.values())
-            return paddle.fluid.dataloader.dataloader_iter.default_collate_fn(examples_tuples)
-
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1, collate_fn=collate_fn)
-        print("DictDataset")
-        for field1, field2 in dataloader:
-            print(type(field1), field1.dtype, field1.shape)
-            print(type(field2), field2.dtype, field2.shape)
-
-
-class TestSliceDataset(unittest.TestCase):
-    def test(self):
-        dataset = MyDataset(40)
-        dataset = data.SliceDataset(dataset, 0, 20)
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("SliceDataset")
-        for batch, in dataloader:
-            print(type(batch), batch.dtype, batch.shape)
-
-
-class TestSplit(unittest.TestCase):
-    def test(self):
-        dataset = MyDataset(40)
-        train, valid = data.split(dataset, 10)
-        dataloader1 = io.DataLoader(train, batch_size=4, shuffle=True, num_workers=1)
-        dataloader2 = io.DataLoader(valid, batch_size=4, shuffle=True, num_workers=1)
-        print("First Dataset")
-        for batch, in dataloader1:
-            print(type(batch), batch.dtype, batch.shape)
-
-        print("Second Dataset")
-        for batch, in dataloader2:
-            print(type(batch), batch.dtype, batch.shape)
-
-
-class TestSubsetDataset(unittest.TestCase):
-    def test(self):
-        dataset = MyDataset(40)
-        indices = np.random.choice(np.arange(40), [20], replace=False).tolist()
-        dataset = data.SubsetDataset(dataset, indices)
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("SubsetDataset")
-        for batch, in dataloader:
-            print(type(batch), batch.dtype, batch.shape)
-
-
-class TestFilterDataset(unittest.TestCase):
-    def test(self):
-        dataset = MyDataset(40)
-        dataset = data.FilterDataset(dataset, lambda x: np.mean(x) > 0.3)
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("FilterDataset")
-        for batch, in dataloader:
-            print(type(batch), batch.dtype, batch.shape)
-
-
-class TestCacheDataset(unittest.TestCase):
-    def test(self):
-        dataset = MyDataset(40)
-        dataset = data.CacheDataset(dataset)
-        dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
-        print("CacheDataset")
-        for batch, in dataloader:
-            print(type(batch), batch.dtype, batch.shape)
@@ -1,107 +0,0 @@
-import numpy as np
-import unittest
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.models import deepvoice3 as dv3
-
-class TestConvBlock(unittest.TestCase):
-    def test_io_causal(self):
-        net = dv3.ConvBlock(6, 5, True, True, 8, 0.9)
-        x = paddle.randn([4, 32, 6])
-        condition = paddle.randn([4, 8])
-        # TODO(chenfeiyu): to report an issue on default data type
-        padding = paddle.zeros([4, 4, 6], dtype=x.dtype)
-        y = net.forward(x, condition, padding)
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 6))
-
-    def test_io_non_causal(self):
-        net = dv3.ConvBlock(6, 5, False, True, 8, 0.9)
-        x = paddle.randn([4, 32, 6])
-        condition = paddle.randn([4, 8])
-        y = net.forward(x, condition)
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 6))
-
-
-class TestAffineBlock1(unittest.TestCase):
-    def test_io(self):
-        net = dv3.AffineBlock1(6, 16, True, 8)
-        x = paddle.randn([4, 32, 6])
-        condition = paddle.randn([4, 8])
-        y = net(x, condition)
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 16))
-
-
-class TestAffineBlock2(unittest.TestCase):
-    def test_io(self):
-        net = dv3.AffineBlock2(6, 16, True, 8)
-        x = paddle.randn([4, 32, 6])
-        condition = paddle.randn([4, 8])
-        y = net(x, condition)
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 16))
-
-
-class TestEncoder(unittest.TestCase):
-    def test_io(self):
-        net = dv3.Encoder(5, 8, 16, 5, True, 6)
-        x = paddle.randn([4, 32, 8])
-        condition = paddle.randn([4, 6])
-        keys, values = net(x, condition)
-        self.assertTupleEqual(keys.numpy().shape, (4, 32, 8))
-        self.assertTupleEqual(values.numpy().shape, (4, 32, 8))
-
-
-class TestAttentionBlock(unittest.TestCase):
-    def test_io(self):
-        net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
-        q = paddle.randn([4, 32, 6])
-        k = paddle.randn([4, 24, 6])
-        v = paddle.randn([4, 24, 6])
-        lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
-        condition = paddle.randn([4, 8])
-        context_vector, attention_weight = net(q, k, v, lengths, condition, 0)
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
-        self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))
-
-    def test_io_with_previous_attn(self):
-        net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
-        q = paddle.randn([4, 32, 6])
-        k = paddle.randn([4, 24, 6])
-        v = paddle.randn([4, 24, 6])
-        lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
-        condition = paddle.randn([4, 8])
-        prev_attn_weight = paddle.randn([4, 32, 16])
-
-        context_vector, attention_weight = net(
-            q, k, v, lengths, condition, 0,
-            force_monotonic=True, prev_coeffs=prev_attn_weight, window=(0, 4))
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
-        self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))
-
-
-class TestDecoder(unittest.TestCase):
-    def test_io(self):
-        net = dv3.Decoder(8, 4, [4, 12], 5, 3, 16, 1.0, 1.45, True, 6)
-        x = paddle.randn([4, 32, 8])
-        k = paddle.randn([4, 24, 12])  # prenet's last size should equal k's feature size
-        v = paddle.randn([4, 24, 12])
-        lengths = paddle.to_tensor([24, 18, 19, 22])
-        condition = paddle.randn([4, 6])
-        decoded, hidden, attentions, final_state = net(x, k, v, lengths, 0, condition)
-        self.assertTupleEqual(decoded.numpy().shape, (4, 32, 4 * 8))
-        self.assertTupleEqual(hidden.numpy().shape, (4, 32, 12))
-        self.assertEqual(len(attentions), 5)
-        self.assertTupleEqual(attentions[0].numpy().shape, (4, 32, 24))
-        self.assertEqual(len(final_state), 5)
-        self.assertTupleEqual(final_state[0].numpy().shape, (4, 2, 12))
-
-
-class TestPostNet(unittest.TestCase):
-    def test_io(self):
-        net = dv3.PostNet(3, 8, 16, 3, 12, 4, True, 6)
-        x = paddle.randn([4, 32, 8])
-        condition = paddle.randn([4, 6])
-        y = net(x, condition)
-        self.assertTupleEqual(y.numpy().shape, (4, 32 * 4, 12))
@@ -1,19 +0,0 @@
-import unittest
-import numpy as np
-
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.modules import geometry as geo
-
-class TestShuffleDim(unittest.TestCase):
-    def test_perm(self):
-        x = paddle.randn([2, 3, 4, 6])
-        y = geo.shuffle_dim(x, 2, [3, 2, 1, 0])
-        np.testing.assert_allclose(x.numpy()[0, 0, :, 0], y.numpy()[0, 0, ::-1, 0])
-
-    def test_random_perm(self):
-        x = paddle.randn([2, 3, 4, 6])
-        y = geo.shuffle_dim(x, 2)
-        np.testing.assert_allclose(x.numpy().sum(2), y.numpy().sum(2))
@@ -1,33 +0,0 @@
-import unittest
-import paddle
-paddle.set_device("cpu")
-import numpy as np
-
-from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy
-
-class TestWeightedMean(unittest.TestCase):
-    def test(self):
-        x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
-        mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
-        loss = weighted_mean(x, mask)
-        self.assertAlmostEqual(loss.numpy()[0], 7)
-
-
-class TestMaskedL1Loss(unittest.TestCase):
-    def test(self):
-        x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
-        y = paddle.zeros_like(x)
-        mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
-        loss = masked_l1_loss(x, y, mask)
-        print(loss)
-        self.assertAlmostEqual(loss.numpy()[0], 7)
-
-
-class TestMaskedCrossEntropy(unittest.TestCase):
-    def test(self):
-        x = paddle.randn([3, 30, 8], dtype="float64")
-        y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1)  # mind this
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
-        loss = masked_softmax_with_cross_entropy(x, y, mask)
-        print(loss)
@@ -1,54 +0,0 @@
-import unittest
-import numpy as np
-import paddle
-paddle.set_default_dtype("float64")
-
-from parakeet.modules import masking
-
-
-def sequence_mask(lengths, max_length=None, dtype="bool"):
-    max_length = max_length or np.max(lengths)
-    ids = np.arange(max_length)
-    return (ids < np.expand_dims(lengths, -1)).astype(dtype)
-
-def future_mask(lengths, max_length=None, dtype="bool"):
-    max_length = max_length or np.max(lengths)
-    return np.tril(np.tril(np.ones(max_length))).astype(dtype)
-
-class TestIDMask(unittest.TestCase):
-    def test(self):
-        ids = paddle.to_tensor(
-            [[1, 2, 3, 0, 0, 0],
-             [2, 4, 5, 6, 0, 0],
-             [7, 8, 9, 0, 0, 0]]
-        )
-        mask = masking.id_mask(ids)
-        self.assertTupleEqual(mask.numpy().shape, ids.numpy().shape)
-        print(mask.numpy())
-
-class TestFeatureMask(unittest.TestCase):
-    def test(self):
-        features = np.random.randn(3, 16, 8)
-        lengths = [16, 14, 12]
-        for i, length in enumerate(lengths):
-            features[i, length:, :] = 0
-
-        feature_tensor = paddle.to_tensor(features)
-        mask = masking.feature_mask(feature_tensor, -1)
-        self.assertTupleEqual(mask.numpy().shape, (3, 16, 1))
-        print(mask.numpy().squeeze())
-
-
-class TestCombineMask(unittest.TestCase):
-    def test_bool_mask(self):
-        lengths = np.array([12, 8, 9, 10])
-        padding_mask = sequence_mask(lengths, dtype="bool")
-        no_future_mask = future_mask(lengths, dtype="bool")
-        combined_mask1 = np.expand_dims(padding_mask, 1) * no_future_mask
-
-        print(paddle.to_tensor(padding_mask).dtype)
-        print(paddle.to_tensor(no_future_mask).dtype)
-        combined_mask2 = masking.combine_mask(
-            paddle.to_tensor(padding_mask).unsqueeze(1), paddle.to_tensor(no_future_mask)
-        )
-        np.testing.assert_allclose(combined_mask2.numpy(), combined_mask1)
@@ -1,64 +0,0 @@
-import unittest
-import numpy as np
-import paddle
-
-from parakeet.modules import positional_encoding as pe
-
-def positional_encoding(start_index, length, size, dtype="float32"):
-    if (size % 2 != 0):
-        raise ValueError("size should be divisible by 2")
-    channel = np.arange(0, size, 2, dtype=dtype)
-    index = np.arange(start_index, start_index + length, 1, dtype=dtype)
-    p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
-    encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
-    return encodings
-
-def scalable_positional_encoding(start_index, length, size, omega):
-    dtype = omega.dtype
-    index = np.arange(start_index, start_index + length, 1, dtype=dtype)
-    channel = np.arange(0, size, 2, dtype=dtype)
-
-    p = np.reshape(omega, omega.shape + (1, 1)) \
-        * np.expand_dims(index, -1) \
-        / (10000 ** (channel / float(size)))
-
-    encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
-    return encodings
-
-class TestPositionEncoding(unittest.TestCase):
-    def __init__(self, start=0, length=20, size=16, dtype="float64"):
-        super(TestPositionEncoding, self).__init__("runTest")
-        self.spec = (start, length, size, dtype)
-
-    def test_equality(self):
-        start, length, size, dtype = self.spec
-        position_embed1 = positional_encoding(start, length, size, dtype)
-        position_embed2 = pe.positional_encoding(start, length, size, dtype)
-        np.testing.assert_allclose(position_embed2.numpy(), position_embed1)
-
-    def runTest(self):
-        paddle.disable_static(paddle.CPUPlace())
-        self.test_equality()
-
-class TestScalablePositionEncoding(unittest.TestCase):
-    def __init__(self, start=0, length=20, size=16, dtype="float64"):
-        super(TestScalablePositionEncoding, self).__init__("runTest")
-        self.spec = (start, length, size, dtype)
-
-    def test_equality(self):
-        start, length, size, dtype = self.spec
-        omega = np.random.uniform(1, 2, size=(4,)).astype(dtype)
-        position_embed1 = scalable_positional_encoding(start, length, size, omega)
-        position_embed2 = pe.scalable_positional_encoding(start, length, size, paddle.to_tensor(omega))
-        np.testing.assert_allclose(position_embed2.numpy(), position_embed1)
-
-    def runTest(self):
-        paddle.disable_static(paddle.CPUPlace())
-        self.test_equality()
-
-
-def load_tests(loader, standard_tests, pattern):
-    suite = unittest.TestSuite()
-    suite.addTest(TestPositionEncoding(0, 20, 16, "float64"))
-    suite.addTest(TestScalablePositionEncoding(0, 20, 16))
-    return suite
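Editor's note: the numpy reference in the deleted tests above computes the standard sinusoidal encoding, laying out all sine channels first and all cosine channels second (equivalent, up to a channel permutation, to the interleaved Transformer convention). In symbols, for position $t$, channel pair index $i$, and embedding size $d$:

```latex
p_{t,i} = \frac{t}{10000^{2i/d}}, \qquad
PE_t = \big[\, \sin(p_{t,0}), \ldots, \sin(p_{t,d/2-1}),\;
              \cos(p_{t,0}), \ldots, \cos(p_{t,d/2-1}) \,\big]
```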
@@ -1,27 +0,0 @@
-import unittest
-import numpy as np
-import librosa
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.modules import stft
-
-class TestSTFT(unittest.TestCase):
-    def test(self):
-        path = librosa.util.example("choice")
-        wav, sr = librosa.load(path, duration=5)
-        wav = wav.astype("float64")
-
-        spec = librosa.stft(wav, n_fft=2048, hop_length=256, win_length=1024)
-        mag1 = np.abs(spec)
-
-        wav_in_batch = paddle.unsqueeze(paddle.to_tensor(wav), 0)
-        mag2 = stft.STFT(2048, 256, 1024).magnitude(wav_in_batch)
-        mag2 = paddle.squeeze(mag2, [0, 2]).numpy()
-
-        print("mag1", mag1)
-        print("mag2", mag2)
-        # TODO(chenfeiyu): Is there something wrong? some elements do not match
-        # np.testing.assert_allclose(mag2, mag1)
@@ -1,43 +0,0 @@
-import unittest
-import numpy as np
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.modules import transformer
-
-class TestPositionwiseFFN(unittest.TestCase):
-    def test_io(self):
-        net = transformer.PositionwiseFFN(8, 12)
-        x = paddle.randn([2, 3, 4, 8])
-        y = net(x)
-        self.assertTupleEqual(y.numpy().shape, (2, 3, 4, 8))
-
-
-class TestTransformerEncoderLayer(unittest.TestCase):
-    def test_io(self):
-        net = transformer.TransformerEncoderLayer(64, 8, 128, 0.5)
-        x = paddle.randn([4, 12, 64])
-        lengths = paddle.to_tensor([12, 8, 9, 10])
-        mask = paddle.fluid.layers.sequence_mask(lengths, dtype=x.dtype)
-        y, attn_weights = net(x, mask)
-
-        self.assertTupleEqual(y.numpy().shape, (4, 12, 64))
-        self.assertTupleEqual(attn_weights.numpy().shape, (4, 8, 12, 12))
-
-
-class TestTransformerDecoderLayer(unittest.TestCase):
-    def test_io(self):
-        net = transformer.TransformerDecoderLayer(64, 8, 128, 0.5)
-        q = paddle.randn([4, 32, 64])
-        k = paddle.randn([4, 24, 64])
-        v = paddle.randn([4, 24, 64])
-        enc_lengths = paddle.to_tensor([24, 18, 20, 22])
-        dec_lengths = paddle.to_tensor([32, 28, 30, 31])
-        enc_mask = paddle.fluid.layers.sequence_mask(enc_lengths, dtype=k.dtype)
-        dec_mask = paddle.fluid.layers.sequence_mask(dec_lengths, dtype=q.dtype)
-        y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
-
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
-        self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
-        self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))
@@ -1,121 +0,0 @@
-import unittest
-import numpy as np
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.models import transformer_tts as tts
-from parakeet.modules import masking
-from pprint import pprint
-
-class TestMultiheadAttention(unittest.TestCase):
-    def test_io_same_qk(self):
-        net = tts.MultiheadAttention(64, 8)
-        q = paddle.randn([4, 12, 64])
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
-        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
-        context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
-        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
-
-    def test_io(self):
-        net = tts.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
-        q = paddle.randn([4, 12, 64])
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
-        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
-        context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
-        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
-
-
-class TestTransformerEncoderLayer(unittest.TestCase):
-    def test_io(self):
-        net = tts.TransformerEncoderLayer(64, 8, 128)
-        x = paddle.randn([4, 12, 64])
-        mask = paddle.fluid.layers.sequence_mask(
-            paddle.to_tensor([12, 10, 8, 9]), dtype=x.dtype)
-        context_vector, attention_weights = net(x, mask)
-        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
-        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
-
-
-class TestTransformerDecoderLayer(unittest.TestCase):
-    def test_io(self):
-        net = tts.TransformerDecoderLayer(64, 8, 128, 0.5)
-        q = paddle.randn([4, 32, 64])
-        k = paddle.randn([4, 24, 64])
-        v = paddle.randn([4, 24, 64])
-        enc_lengths = paddle.to_tensor([24, 18, 20, 22])
-        dec_lengths = paddle.to_tensor([32, 28, 30, 31])
-        enc_mask = masking.sequence_mask(enc_lengths, dtype=k.dtype)
-        dec_padding_mask = masking.sequence_mask(dec_lengths, dtype=q.dtype)
-        no_future_mask = masking.future_mask(32, dtype=q.dtype)
-        dec_mask = masking.combine_mask(dec_padding_mask.unsqueeze(-1), no_future_mask)
-        y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
-
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
-        self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
-        self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))
-
-
-class TestTransformerTTS(unittest.TestCase):
-    def setUp(self):
-        net = tts.TransformerTTS(
-            128, 0, 64, 128, 80, 4, 128,
-            6, 6, 128, 128, 4,
-            3, 10, 0.1)
-        self.net = net
-
-    def test_encode_io(self):
-        net = self.net
-
-        text = paddle.randint(0, 128, [4, 176])
-        lengths = paddle.to_tensor([176, 156, 174, 168])
-        mask = masking.sequence_mask(lengths, dtype=text.dtype)
-        text = text * mask
-
-        encoded, attention_weights, encoder_mask = net.encode(text)
-        print("output shapes:")
-        print("encoded:", encoded.numpy().shape)
-        print("encoder_attentions:", [item.shape for item in attention_weights])
-        print("encoder_mask:", encoder_mask.numpy().shape)
-
-    def test_all_io(self):
-        net = self.net
-
-        text = paddle.randint(0, 128, [4, 176])
-        lengths = paddle.to_tensor([176, 156, 174, 168])
-        mask = masking.sequence_mask(lengths, dtype=text.dtype)
-        text = text * mask
-
-        mel = paddle.randn([4, 189, 80])
-        frames = paddle.to_tensor([189, 186, 179, 174])
-        mask = masking.sequence_mask(frames, dtype=frames.dtype)
-        mel = mel * mask.unsqueeze(-1)
-
-        encoded, encoder_attention_weights, encoder_mask = net.encode(text)
-        mel_output, mel_intermediate, cross_attention_weights, stop_logits = net.decode(encoded, mel, encoder_mask)
-
-        print("output shapes:")
-        print("encoder_output:", encoded.numpy().shape)
-        print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
-        print("encoder_mask:", encoder_mask.numpy().shape)
-        print("mel_output: ", mel_output.numpy().shape)
-        print("mel_intermediate: ", mel_intermediate.numpy().shape)
-        print("decoder_attentions:", [item.shape for item in cross_attention_weights])
-        print("stop_logits:", stop_logits.numpy().shape)
-
-    def test_predict_io(self):
-        net = self.net
-        net.eval()
-        with paddle.no_grad():
-            text = paddle.randint(0, 128, [176])
-            decoder_output, encoder_attention_weights, cross_attention_weights = net.predict(text)
-
-        print("output shapes:")
-        print("mel_output: ", decoder_output.numpy().shape)
-        print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
-        print("decoder_attentions:", [item.shape for item in cross_attention_weights])
@@ -1,130 +0,0 @@
-import numpy as np
-import unittest
-
-import paddle
-paddle.set_default_dtype("float64")
-paddle.disable_static(paddle.CPUPlace())
-
-from parakeet.models import waveflow
-
-class TestFold(unittest.TestCase):
-    def test_audio(self):
-        x = paddle.randn([4, 32 * 8])
-        y = waveflow.fold(x, 8)
-        self.assertTupleEqual(y.numpy().shape, (4, 32, 8))
-
-    def test_spec(self):
-        x = paddle.randn([4, 80, 32 * 8])
-        y = waveflow.fold(x, 8)
-        self.assertTupleEqual(y.numpy().shape, (4, 80, 32, 8))
-
-
-class TestUpsampleNet(unittest.TestCase):
-    def test_io(self):
-        net = waveflow.UpsampleNet([2, 2])
-        x = paddle.randn([4, 8, 6])
-        y = net(x)
-        self.assertTupleEqual(y.numpy().shape, (4, 8, 2 * 2 * 6))
-
-
-class TestResidualBlock(unittest.TestCase):
-    def test_io(self):
-        net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
-        x = paddle.randn([4, 4, 16, 32])
-        condition = paddle.randn([4, 6, 16, 32])
-        res, skip = net(x, condition)
-        self.assertTupleEqual(res.numpy().shape, (4, 4, 16, 32))
-        self.assertTupleEqual(skip.numpy().shape, (4, 4, 16, 32))
-
-    def test_add_input(self):
-        net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
-        net.eval()
-        net.start_sequence()
-
-        x_row = paddle.randn([4, 4, 1, 32])
-        condition_row = paddle.randn([4, 6, 1, 32])
-
-        res, skip = net.add_input(x_row, condition_row)
-        self.assertTupleEqual(res.numpy().shape, (4, 4, 1, 32))
-        self.assertTupleEqual(skip.numpy().shape, (4, 4, 1, 32))
-
-
-class TestResidualNet(unittest.TestCase):
-    def test_io(self):
-        net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
-        x = paddle.randn([4, 6, 8, 32])
-        condition = paddle.randn([4, 8, 8, 32])
-        y = net(x, condition)
-        self.assertTupleEqual(y.numpy().shape, (4, 6, 8, 32))
-
-    def test_add_input(self):
-        net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
-        net.eval()
-        net.start_sequence()
-
-        x_row = paddle.randn([4, 6, 1, 32])
-        condition_row = paddle.randn([4, 8, 1, 32])
-
-        y_row = net.add_input(x_row, condition_row)
-        self.assertTupleEqual(y_row.numpy().shape, (4, 6, 1, 32))
-
-
-class TestFlow(unittest.TestCase):
-    def test_io(self):
-        net = waveflow.Flow(8, 16, 7, (3, 3), 8)
-
-        x = paddle.randn([4, 1, 8, 32])
-        condition = paddle.randn([4, 7, 8, 32])
-        z, (logs, b) = net(x, condition)
-        self.assertTupleEqual(z.numpy().shape, (4, 1, 8, 32))
-        self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
-        self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
-
-    def test_inverse_row(self):
-        net = waveflow.Flow(8, 16, 7, (3, 3), 8)
-        net.eval()
-        net._start_sequence()
-
-        x_row = paddle.randn([4, 1, 1, 32])  # last row
-        condition_row = paddle.randn([4, 7, 1, 32])
-        z_row = paddle.randn([4, 1, 1, 32])
-        x_next_row, (logs, b) = net._inverse_row(z_row, x_row, condition_row)
-
-        self.assertTupleEqual(x_next_row.numpy().shape, (4, 1, 1, 32))
-        self.assertTupleEqual(logs.numpy().shape, (4, 1, 1, 32))
-        self.assertTupleEqual(b.numpy().shape, (4, 1, 1, 32))
-
-    def test_inverse(self):
-        net = waveflow.Flow(8, 16, 7, (3, 3), 8)
-        net.eval()
-
-        z = paddle.randn([4, 1, 8, 32])
-        condition = paddle.randn([4, 7, 8, 32])
-
-        with paddle.no_grad():
-            x, (logs, b) = net.inverse(z, condition)
-        self.assertTupleEqual(x.numpy().shape, (4, 1, 8, 32))
-        self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
-        self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
-
-
-class TestWaveFlow(unittest.TestCase):
-    def test_io(self):
-        x = paddle.randn([4, 32 * 8])
-        condition = paddle.randn([4, 7, 32 * 8])
-        net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
-        z, logs_det_jacobian = net(x, condition)
-
-        self.assertTupleEqual(z.numpy().shape, (4, 32 * 8))
-        self.assertTupleEqual(logs_det_jacobian.numpy().shape, (1,))
-
-    def test_inverse(self):
-        z = paddle.randn([4, 32 * 8])
-        condition = paddle.randn([4, 7, 32 * 8])
-
-        net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
-        net.eval()
-
-        with paddle.no_grad():
-            x = net.inverse(z, condition)
-        self.assertTupleEqual(x.numpy().shape, (4, 32 * 8))
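Editor's note: the `logs_det_jacobian` returned by `WaveFlow` in the deleted tests is the log-determinant term of the change-of-variables identity that normalizing flows maximize during training, with `z = f(x)` the forward transform and `net.inverse` its inverse:

```latex
\log p_X(x) = \log p_Z\big(f(x)\big) + \log \left| \det \frac{\partial f(x)}{\partial x} \right|
```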