Merge pull request #66 from iclementine/reborn

format code and discard opencc
Feiyu Chan 2020-12-20 13:53:31 +08:00 committed by GitHub
commit fe7ddc2aaf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
72 changed files with 1258 additions and 1571 deletions

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -14,7 +28,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))

-
 # -- Project information -----------------------------------------------------

 project = 'parakeet'
@@ -24,7 +37,6 @@ author = 'parakeet-developers'
 # The full version, including alpha/beta/rc tags
 release = '0.2'

-
 # -- General configuration ---------------------------------------------------

 # Add any Sphinx extension module names here, as strings. They can be
@@ -46,7 +58,6 @@ templates_path = ['_templates']
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []

-
 # -- Options for HTML output -------------------------------------------------

 # The theme to use for HTML and HTML Help pages. See the documentation for

View File

@@ -102,11 +102,3 @@ optional arguments:
   --opts ...            options to overwrite --config file and the default
                         config, passing in KEY VALUE pairs
 ```

View File

@@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+
 ```
 This repository contains several examples, which can be found in [Parakeet/examples](../examples). These experiments are provided as samples that users can run directly. Users are also welcome to add new models and experiments, and to contribute code to `Parakeet`.

View File

@@ -9,10 +9,3 @@ Parakeet provides users and developers with:
 1. Reusable models and common modules;
 2. Complete experiments covering the whole pipeline, from data processing and model training to inference;
 3. High-quality models that work out of the box.

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN

 _C = CN()
@@ -14,8 +28,7 @@ _C.data = CN(
         padding_idx=0,  # text embedding's padding index
         mel_start_value=0.5,  # value for starting frame
         mel_end_value=-0.5,  # # value for ending frame
-    )
-)
+    ))

 _C.model = CN(
     dict(
@@ -33,8 +46,7 @@ _C.model = CN(
         dropout=0.1,  # global droput probability
         stop_loss_scale=8.0,  # scaler for stop _loss
         decoder_prenet_dropout=0.5,  # decoder prenet dropout probability
-    )
-)
+    ))

 _C.training = CN(
     dict(
@@ -45,8 +57,8 @@ _C.training = CN(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=900000,  # max iteration to train
-    )
-)
+    ))

+
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader
 from parakeet.data.batch import batch_spec, batch_text_id
 from parakeet.data import dataset

+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         records = []
@@ -38,8 +54,8 @@ class Transform(object):
         ids, mel = example  # ids already have <s> and </s>
         ids = np.array(ids, dtype=np.int64)
         # add start and end frame
-        mel = np.pad(mel,
-                     [(0, 0), (1, 1)],
+        mel = np.pad(
+            mel, [(0, 0), (1, 1)],
             mode='constant',
             constant_values=[(0, 0), (self.start_value, self.end_value)])
         stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
@@ -50,6 +66,7 @@
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_idx=0, padding_value=0.):
         self.padding_idx = padding_idx
         self.padding_value = padding_value
@@ -67,7 +84,8 @@ class LJSpeechCollector(object):
 def create_dataloader(config, source_path):
     lj = LJSpeech(source_path)
-    transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
+    transform = Transform(config.data.mel_start_value,
+                          config.data.mel_end_value)
     lj = dataset.TransformDataset(lj, transform)
     valid_set, train_set = dataset.split(lj, config.data.valid_size)
@@ -85,4 +103,3 @@ def create_dataloader(config, source_path):
             drop_last=False,
             collate_fn=data_collator)
     return train_loader, valid_loader
-
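The `np.pad` call reformatted above is what `Transform` uses to prepend a start frame and append an end frame: the mel axis gets no padding, the time axis gets one frame on each side, filled with the start/end sentinel values. A standalone sketch on a toy array (0.5 and -0.5 mirror the config defaults):

```
import numpy as np

mel = np.zeros((2, 3), dtype=np.float32)  # toy (n_mels, T) spectrogram

padded = np.pad(
    mel, [(0, 0), (1, 1)],  # no padding on mels, one frame on each side
    mode='constant',
    constant_values=[(0, 0), (0.5, -0.5)])  # start / end sentinels

print(padded.shape)  # (2, 5)
print(padded[0])     # [ 0.5  0.   0.   0.  -0.5]
```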

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import pickle
@@ -11,6 +25,7 @@ from parakeet.frontend import English
 from config import get_cfg_defaults

+
 def create_dataset(config, source_path, target_path, verbose=False):
     # create output dir
     target_path = Path(target_path).expanduser()
@@ -47,7 +62,8 @@ def create_dataset(config, source_path, target_path, verbose=False):
     with open(target_path / "metadata.pkl", 'wb') as f:
         pickle.dump(records, f)
     if verbose:
-        print("saved metadata into {}".format(target_path / "metadata.pkl"))
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.pkl"))

     # also save meta data into text format for inspection
     with open(target_path / "metadata.txt", 'wt') as f:
@@ -55,20 +71,30 @@ def create_dataset(config, source_path, target_path, verbose=False):
             phoneme_str = "|".join(phonemes)
             f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
     if verbose:
-        print("saved metadata into {}".format(target_path / "metadata.txt"))
+        print("saved metadata into {}".format(target_path /
+                                              "metadata.txt"))
     print("Done.")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()
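`nargs=argparse.REMAINDER` is what makes the `--opts` flag work: everything after it is collected verbatim, in exactly the flat KEY VALUE form that `CfgNode.merge_from_list` expects. A small sketch of that behaviour (a stripped-down parser, not the full one above):

```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str)
parser.add_argument("--opts", nargs=argparse.REMAINDER)

# Everything after --opts is swallowed as-is.
args = parser.parse_args(
    ["--input", "ljspeech", "--opts", "data.batch_size", "32"])
print(args.opts)  # ['data.batch_size', '32']
```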

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import time
 from pathlib import Path
@@ -13,14 +27,15 @@ from parakeet.utils.display import add_attention_plots
 from config import get_cfg_defaults

+
 @paddle.fluid.dygraph.no_grad
 def main(config, args):
     paddle.set_device(args.device)

     # model
     frontend = English()
-    model = TransformerTTS.from_pretrained(
-        frontend, config, args.checkpoint_path)
+    model = TransformerTTS.from_pretrained(frontend, config,
                                            args.checkpoint_path)
     model.eval()

     # inputs
@@ -38,19 +53,33 @@ def main(config, args):
         mel_output = mel_output.T  #(C, T)
         np.save(str(output_dir / f"sentence_{i}"), mel_output)
         if args.verbose:
-            print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy"))
+            print("spectrogram saved at {}".format(output_dir /
+                                                   f"sentence_{i}.npy"))

 if __name__ == "__main__":
     config = get_cfg_defaults()
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
     parser.add_argument("--input", type=str, help="path of the text sentences")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 import logging
 from pathlib import Path
@@ -19,6 +33,7 @@ from parakeet.training.experiment import ExperimentBase
 from config import get_cfg_defaults
 from ljspeech import LJSpeech, LJSpeechCollector, Transform

+
 class Experiment(ExperimentBase):
     def setup_model(self):
         config = self.config
@@ -46,8 +61,7 @@ class Experiment(ExperimentBase):
             beta1=0.9,
             beta2=0.98,
             epsilon=1e-9,
-            parameters=model.parameters()
-        )
+            parameters=model.parameters())
         criterion = TransformerTTSLoss(config.model.stop_loss_scale)
         drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
         reduction_factor = scheduler.StepWise(config.training.reduction_factor)
@@ -63,9 +77,12 @@ class Experiment(ExperimentBase):
         config = self.config
         ljspeech_dataset = LJSpeech(args.data)
-        transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
-        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        transform = Transform(config.data.mel_start_value,
+                              config.data.mel_end_value)
+        ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
+                                                    transform)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)
         batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

         if not self.parallel:
@@ -99,7 +116,7 @@ class Experiment(ExperimentBase):
                           self.drop_n_heads(self.iteration))
         # TODO(chenfeiyu): we can combine these 2 slices
-        mel_input = mel[:,:-1, :]
+        mel_input = mel[:, :-1, :]
         reduced_mel_input = mel_input[:, ::model_core.r, :]
         outputs = self.model(text, reduced_mel_input)
         return outputs
@@ -115,11 +132,8 @@ class Experiment(ExperimentBase):
         time_steps = mel_target.shape[1]
         losses = self.criterion(
-            mel_output[:,:time_steps, :],
-            mel_intermediate[:,:time_steps, :],
-            mel_target,
-            stop_logits[:,:time_steps, :],
-            stop_label_target)
+            mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
+            mel_target, stop_logits[:, :time_steps, :], stop_label_target)
         return losses

     def train_batch(self):
@@ -141,13 +155,16 @@ class Experiment(ExperimentBase):
         # logging
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items())
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
+        msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                         for k, v in losses_np.items())
         self.logger.info(msg)

         if dist.get_rank() == 0:
             for k, v in losses_np.items():
-                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
+                self.visualizer.add_scalar(f"train_loss/{k}", v,
+                                           self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -165,8 +182,7 @@ class Experiment(ExperimentBase):
                 display.add_multi_attention_plots(
                     self.visualizer,
                     f"valid_sentence_{i}_cross_attention_weights",
-                    attention_weights,
-                    self.iteration)
+                    attention_weights, self.iteration)

         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
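The two slices mentioned in the TODO implement teacher forcing with a reduction factor `r`: dropping the last frame shifts the decoder input by one step, and `::r` then keeps every r-th frame. A toy illustration of the indexing, with numpy standing in for paddle tensors:

```
import numpy as np

r = 2                                 # hypothetical reduction factor
mel = np.arange(8).reshape(1, 8, 1)   # (batch, time, n_mels)

mel_input = mel[:, :-1, :]            # shifted input: frames 0..6
reduced_mel_input = mel_input[:, ::r, :]  # keep every r-th frame
print(reduced_mel_input[0, :, 0])     # [0 2 4 6]
```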

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN

 _C = CN()
@@ -12,8 +26,7 @@ _C.data = CN(
         f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         clip_frames=65,  # mel clip frames
-    )
-)
+    ))

 _C.model = CN(
     dict(
@@ -24,8 +37,7 @@ _C.model = CN(
         channels=128,  # resiaudal channel in each flow
         kernel_size=[3, 3],  # kernel size in each conv block
         sigma=1.0,  # stddev of the random noise
-    )
-)
+    ))

 _C.training = CN(
     dict(
@@ -33,8 +45,8 @@ _C.training = CN(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=3000000,  # max iteration to train
-    )
-)
+    ))

+
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor

+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
+            names=["fname", "frames", "samples"])

         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
@@ -70,9 +86,7 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        mel_clip = mel[:, start: start + self.clip_frames]
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        mel_clip = mel[:, start:start + self.clip_frames]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel_clip, wav_clip
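The clip collector keeps the two modalities aligned: one spectrogram frame covers `hop_length` waveform samples, so the frame index `start` is scaled by `hop_length` on the audio side. A numpy sketch with made-up sizes:

```
import numpy as np

hop_length = 256    # samples per frame (the usual LJSpeech setup)
clip_frames = 65    # from the config above
start = 10          # chosen randomly in the real collector

mel = np.zeros((80, 200))         # (n_mels, frames)
wav = np.zeros(200 * hop_length)  # the matching waveform

mel_clip = mel[:, start:start + clip_frames]
wav_clip = wav[start * hop_length:(start + clip_frames) * hop_length]

print(mel_clip.shape[1])                # 65
print(wav_clip.shape[0] // hop_length)  # 65
```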

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)

-    transform = Transform(
-        config.sample_rate,
-        config.n_fft,
-        config.win_length,
-        config.hop_length,
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)

     file_names = []
     for example in tqdm.tqdm(dataset):
@@ -109,20 +119,32 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
     print("Done!")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -8,9 +22,9 @@ import parakeet
 from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow
 from parakeet.utils import layer_tools, checkpoint
 from config import get_cfg_defaults

 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path)
@@ -23,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -31,14 +46,29 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import numpy as np
@@ -34,7 +48,8 @@ class Experiment(ExperimentBase):
         if self.parallel > 1:
             model = paddle.DataParallel(model)
-        optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters())
+        optimizer = paddle.optimizer.Adam(
+            config.training.lr, parameters=model.parameters())
         criterion = WaveFlowLoss(sigma=config.model.sigma)

         self.model = model
@@ -46,9 +61,11 @@ class Experiment(ExperimentBase):
         args = self.args

         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)

-        batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length)
+        batch_fn = LJSpeechClipCollector(config.data.clip_frames,
+                                         config.data.hop_length)

         if not self.parallel:
             train_loader = DataLoader(
@@ -97,10 +114,12 @@ class Experiment(ExperimentBase):
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -112,7 +131,8 @@ class Experiment(ExperimentBase):
             loss = self.criterion(z, log_det_jocobian)
             valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)

 def main_sp(config, args):

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from yacs.config import CfgNode as CN

 _C = CN()
@@ -12,8 +26,7 @@ _C.data = CN(
         # f_max=8000,  # Hz, max frequency when converting to mel
         n_mels=80,  # mel bands
         train_clip_seconds=0.5,  # audio clip length(in seconds)
-    )
-)
+    ))

 _C.model = CN(
     dict(
@@ -24,9 +37,7 @@ _C.model = CN(
         residual_channels=128,  # resiaudal channel in each flow
         loss_type="mog",
         output_dim=3,  # single gaussian
-        log_scale_min=-9.0,
-    )
-)
+        log_scale_min=-9.0, ))

 _C.training = CN(
     dict(
@@ -37,8 +48,8 @@ _C.training = CN(
         valid_interval=1000,  # validation
         save_interval=10000,  # checkpoint
         max_iteration=3000000,  # max iteration to train
         gradient_max_norm=100.0  # global norm of gradients
-    )
-)
+    ))

+
 def get_cfg_defaults():
     """Get a yacs CfgNode object with default values for my_project."""

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from pathlib import Path
 import pickle
@@ -9,19 +23,20 @@ from parakeet.data.batch import batch_spec, batch_wav
 from parakeet.data import dataset
 from parakeet.audio import AudioProcessor

+
 class LJSpeech(Dataset):
     """A simple dataset adaptor for the processed ljspeech dataset."""
+
     def __init__(self, root):
         self.root = Path(root).expanduser()
         meta_data = pandas.read_csv(
             str(self.root / "metadata.csv"),
             sep="\t",
             header=None,
-            names=["fname", "frames", "samples"]
-        )
+            names=["fname", "frames", "samples"])

         records = []
-        for row in meta_data.itertuples() :
+        for row in meta_data.itertuples():
             mel_path = str(self.root / "mel" / (row.fname + ".npy"))
             wav_path = str(self.root / "wav" / (row.fname + ".npy"))
             records.append((mel_path, wav_path))
@@ -39,6 +54,7 @@ class LJSpeech(Dataset):
 class LJSpeechCollector(object):
     """A simple callable to batch LJSpeech examples."""
+
     def __init__(self, padding_value=0.):
         self.padding_value = padding_value
@@ -48,7 +64,7 @@ class LJSpeechCollector(object):
         wavs = [example[1] for example in examples]
         mels = batch_spec(mels, pad_value=self.padding_value)
         wavs = batch_wav(wavs, pad_value=self.padding_value)
-        audio_starts = np.zeros((batch_size,), dtype=np.int64)
+        audio_starts = np.zeros((batch_size, ), dtype=np.int64)
         return mels, wavs, audio_starts

@@ -75,7 +91,8 @@ class LJSpeechClipCollector(object):
         mel, wav = example
         frames = mel.shape[-1]
         start = np.random.randint(0, frames - self.clip_frames)
-        wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
+        wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
+                       self.hop_length]
         return mel, wav_clip, start

@@ -132,7 +149,3 @@ class DataCollector(object):
         audios = np.array(audios, dtype=np.float32)
         audio_starts = np.array(audio_starts, dtype=np.int64)
         return audios, mels, audio_starts

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import tqdm
 import csv
@@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
     output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(exist_ok=True)

-    transform = Transform(
-        config.sample_rate,
-        config.n_fft,
-        config.win_length,
-        config.hop_length,
-        config.n_mels)
+    transform = Transform(config.sample_rate, config.n_fft, config.win_length,
+                          config.hop_length, config.n_mels)

     file_names = []
     for example in tqdm.tqdm(dataset):
@@ -110,20 +120,32 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
         file_names.append((base_name, mel.shape[-1], audio.shape[-1]))

     meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
+    meta_data.to_csv(
+        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
+    print("saved meta data in to {}".format(
+        os.path.join(output_dir, "metadata.csv")))
     print("Done!")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument("--output", type=str, help="path to save output dataset")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER,
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--input", type=str, help="path of the ljspeech dataset")
+    parser.add_argument(
+        "--output", type=str, help="path to save output dataset")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
     )
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     config = get_cfg_defaults()
     args = parser.parse_args()

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import numpy as np
 import soundfile as sf
@@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint
 from config import get_cfg_defaults

+
 def main(config, args):
     paddle.set_device(args.device)
     model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path)
@@ -22,7 +37,8 @@ def main(config, args):
     for file_path in mel_dir.iterdir():
         mel = np.load(str(file_path))
         audio = model.predict(mel)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
+        audio_path = output_dir / (
+            os.path.splitext(file_path.name)[0] + ".wav")
         sf.write(audio_path, audio, config.data.sample_rate)
         print("[synthesize] {} -> {}".format(file_path, audio_path))

@@ -30,14 +46,29 @@ def main(config, args):
 if __name__ == "__main__":
     config = get_cfg_defaults()
-    parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
-    parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
-    parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
+    parser = argparse.ArgumentParser(
+        description="generate mel spectrogram with TransformerTTS.")
+    parser.add_argument(
+        "--config",
+        type=str,
+        metavar="FILE",
+        help="extra config to overwrite the default config")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="path of directory containing mel spectrogram (in .npy format)")
     parser.add_argument("--output", type=str, help="path to save outputs")
-    parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
-    parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
+    parser.add_argument(
+        "--device", type=str, default="cpu", help="device type to use.")
+    parser.add_argument(
+        "--opts",
+        nargs=argparse.REMAINDER,
+        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="print msg")

     args = parser.parse_args()
     if args.config:

View File

@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from pathlib import Path
 import math
@@ -39,13 +53,13 @@ class Experiment(ExperimentBase):
             model = paddle.DataParallel(model)

         lr_scheduler = paddle.optimizer.lr.StepDecay(
-            config.training.lr,
-            config.training.anneal_interval,
+            config.training.lr, config.training.anneal_interval,
             config.training.anneal_rate)
         optimizer = paddle.optimizer.Adam(
             lr_scheduler,
             parameters=model.parameters(),
-            grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm))
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(
+                config.training.gradient_max_norm))

         self.model = model
         self.model_core = model._layer if self.parallel else model
@@ -56,7 +70,8 @@ class Experiment(ExperimentBase):
         args = self.args

         ljspeech_dataset = LJSpeech(args.data)
-        valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
+        valid_set, train_set = dataset.split(ljspeech_dataset,
+                                             config.data.valid_size)

         # convolutional net's causal padding size
         context_size = config.model.n_stack \
@@ -66,7 +81,8 @@ class Experiment(ExperimentBase):

         # frames used to compute loss
         frames_per_second = config.data.sample_rate // config.data.hop_length
-        train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second)
+        train_clip_frames = math.ceil(config.data.train_clip_seconds *
+                                      frames_per_second)

         num_frames = train_clip_frames + context_frames
         batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)
@@ -111,10 +127,12 @@ class Experiment(ExperimentBase):
         loss_value = float(loss)
         msg = "Rank: {}, ".format(dist.get_rank())
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
+        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
+                                                  iteration_time)
         msg += "loss: {:>.6f}".format(loss_value)
         self.logger.info(msg)
-        self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "train/loss", loss_value, global_step=self.iteration)

     @mp_tools.rank_zero_only
     @paddle.no_grad()
@@ -126,7 +144,8 @@ class Experiment(ExperimentBase):
             loss = self.model.loss(y, wav)
             valid_losses.append(float(loss))
         valid_loss = np.mean(valid_losses)
-        self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
+        self.visualizer.add_scalar(
+            "valid/loss", valid_loss, global_step=self.iteration)

 def main_sp(config, args):
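The clip-length arithmetic reformatted above is worth a worked example: the causal convolution stack needs `context_size` past samples, so each training clip is the loss-bearing frames plus the context frames. With hypothetical LJSpeech-style values (22050 Hz and a hop of 256 are assumptions; neither is shown in this hunk):

```
import math

sample_rate = 22050       # assumed, not shown in this hunk
hop_length = 256          # assumed, not shown in this hunk
train_clip_seconds = 0.5  # from the wavenet config above

frames_per_second = sample_rate // hop_length  # 86
train_clip_frames = math.ceil(
    train_clip_seconds * frames_per_second)    # ceil(43.0) = 43
print(frames_per_second, train_clip_frames)    # 86 43
```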

View File

@@ -18,15 +18,16 @@ import numpy as np
 __all__ = ["AudioProcessor"]

+
 class AudioProcessor(object):
     def __init__(self,
-                 sample_rate:int,
-                 n_fft:int,
-                 win_length:int,
-                 hop_length:int,
-                 n_mels:int=80,
-                 f_min:int=0,
-                 f_max:int=None,
+                 sample_rate: int,
+                 n_fft: int,
+                 win_length: int,
+                 hop_length: int,
+                 n_mels: int=80,
+                 f_min: int=0,
+                 f_max: int=None,
                  window="hann",
                  center=True,
                  pad_mode="reflect"):
@@ -50,8 +51,7 @@ class AudioProcessor(object):
         self.inv_mel_filter = np.linalg.pinv(self.mel_filter)

     def _create_mel_filter(self):
-        mel_filter = librosa.filters.mel(
-            self.sample_rate,
+        mel_filter = librosa.filters.mel(self.sample_rate,
             self.n_fft,
             n_mels=self.n_mels,
             fmin=self.f_min,
@@ -69,7 +69,7 @@ class AudioProcessor(object):
     def stft(self, wav):
         D = librosa.core.stft(
             wav,
-            n_fft = self.n_fft,
+            n_fft=self.n_fft,
             hop_length=self.hop_length,
             win_length=self.win_length,
             window=self.window,
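`AudioProcessor` precomputes both the mel filter bank and its pseudo-inverse, so a mel spectrogram can be mapped back to an approximate linear spectrogram. A minimal sketch of those two steps (keyword arguments are used so it also runs on newer librosa releases):

```
import numpy as np
import librosa

mel_filter = librosa.filters.mel(
    sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)  # (80, 513)
inv_mel_filter = np.linalg.pinv(mel_filter)              # (513, 80)

spec = np.abs(np.random.randn(513, 10))    # fake linear magnitude spectrogram
mel = np.dot(mel_filter, spec)             # forward: (80, 10)
approx_spec = np.dot(inv_mel_filter, mel)  # rough inverse of the mel mapping
```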

View File

@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This modules contains normalizers for spectrogram magnitude.
@@ -23,10 +36,12 @@ class NormalizerBase(object):
     def inverse(self, normalized):
         raise NotImplementedError("inverse must be implemented")

+
 class LogMagnitude(NormalizerBase):
     """
     This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
     """
+
     def __init__(self, min=1e-7):
         self.min = min
@@ -44,6 +59,7 @@ class UnitMagnitude(NormalizerBase):
     """
     This is the normalizer used in the
     """
+
     def __init__(self, min=1e-5):
         self.min = min
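The diff only shows the class headers, but the usual log-magnitude recipe (an assumption here, not part of this commit) is to clip to a floor before taking the log, which makes `inverse` a plain `exp`. A hedged re-creation:

```
import numpy as np

class LogMagnitudeSketch:
    """Hypothetical re-creation of a log-magnitude normalizer."""

    def __init__(self, min=1e-7):
        self.min = min

    def transform(self, x):
        # clip to a floor so log never sees zeros
        return np.log(np.maximum(x, self.min))

    def inverse(self, normalized):
        return np.exp(normalized)
```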

View File

@@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided.
 import numpy as np

 __all__ = [
-    "batch_text_id", "batch_wav", "batch_spec",
-    "TextIDBatcher", "WavBatcher", "SpecBatcher",
+    "batch_text_id",
+    "batch_wav",
+    "batch_spec",
+    "TextIDBatcher",
+    "WavBatcher",
+    "SpecBatcher",
 ]

 class TextIDBatcher(object):
     """A wrapper class for `batch_text_id`."""
@@ -113,7 +118,11 @@ class SpecBatcher(object):
         self.time_major = time_major

     def __call__(self, minibatch):
-        out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype)
+        out = batch_spec(
+            minibatch,
+            pad_value=self.pad_value,
+            time_major=self.time_major,
+            dtype=self.dtype)
         return out

@@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
     """
     # assume (F, T) or (T, F)
     peek_example = minibatch[0]
-    assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram"
+    assert len(
+        peek_example.shape) == 2, "we only handles mono channel spectrogram"

     # assume (F, n_frame) or (n_frame, F)
     time_idx = 0 if time_major else -1
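`batch_spec` pads a minibatch of 2-D spectrograms to a common number of frames, which is why the assertion above rejects anything that is not mono-channel. A hedged numpy sketch of that padding for the default `(F, T)` layout (not the library's actual implementation):

```
import numpy as np

def pad_specs(minibatch, pad_value=0.):
    # pad each (F, T) example on the time axis up to the longest one
    max_frames = max(example.shape[-1] for example in minibatch)
    padded = [
        np.pad(example, [(0, 0), (0, max_frames - example.shape[-1])],
               mode='constant',
               constant_values=pad_value) for example in minibatch
    ]
    return np.stack(padded)  # (B, F, max_frames)

batch = pad_specs([np.ones((80, 3)), np.ones((80, 5))])
print(batch.shape)  # (2, 80, 5)
```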

View File

@@ -17,17 +17,25 @@ import paddle
 from paddle.io import Dataset

 __all__ = [
-    "split", "TransformDataset", "CacheDataset", "TupleDataset",
-    "DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset",
+    "split",
+    "TransformDataset",
+    "CacheDataset",
+    "TupleDataset",
+    "DictDataset",
+    "SliceDataset",
+    "SubsetDataset",
+    "FilterDataset",
     "ChainDataset",
 ]

+
 def split(dataset, first_size):
     """A utility function to split a dataset into two datasets."""
     first = SliceDataset(dataset, 0, first_size)
     second = SliceDataset(dataset, first_size, len(dataset))
     return first, second

+
 class TransformDataset(Dataset):
     def __init__(self, dataset, transform):
         """Dataset which is transformed from another with a transform.

View File

@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.common import *
from parakeet.datasets.ljspeech import *
View File
@@ -1,9 +1,24 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
import os
import librosa

__all__ = ["AudioFolderDataset"]

class AudioFolderDataset(Dataset):
    def __init__(self, path, sample_rate, extension="wav"):
        self.root = os.path.expanduser(path)
View File
@@ -1,8 +1,23 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.io import Dataset
from pathlib import Path

__all__ = ["LJSpeechMetaData"]

class LJSpeechMetaData(Dataset):
    def __init__(self, root):
        self.root = Path(root).expanduser()

@@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset):
    def __len__(self):
        return len(self.records)
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.vocab import *
from parakeet.frontend.phonectic import *
from parakeet.frontend.punctuation import *
View File
@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *
View File
@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
View File
@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def full2half_width(ustr):
    half = []
    for u in ustr:

@@ -10,6 +24,7 @@ def full2half_width(ustr):
            half.append(u)
    return ''.join(half)

def half2full_width(ustr):
    full = []
    for u in ustr:
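The bodies of these helpers rest on a fixed Unicode offset: the full-width ASCII variants (U+FF01..U+FF5E) sit exactly 0xFEE0 above their half-width counterparts, with U+3000 as the ideographic space. A plausible full body for `full2half_width` (the hunk only shows its frame):

def full2half_width(ustr):
    half = []
    for u in ustr:
        num = ord(u)
        if num == 0x3000:              # ideographic space
            half.append(' ')
        elif 0xFF01 <= num <= 0xFF5E:  # full-width ASCII variants
            half.append(chr(num - 0xFEE0))
        else:                          # already half-width
            half.append(u)
    return ''.join(half)

assert full2half_width('ＡＢＣ１２３') == 'ABC123'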
View File
@@ -17,7 +17,8 @@ from typing import Union
from g2p_en import G2p
from g2pM import G2pM
from parakeet.frontend import Vocab
-from opencc import OpenCC
+# discard opencc until we find an easy solution to install it on windows
+# from opencc import OpenCC
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.normalizer.normalizer import normalize

@@ -211,7 +212,7 @@ class Chinese(Phonetics):
    """
    def __init__(self):
-        self.opencc_backend = OpenCC('t2s.json')
+        # self.opencc_backend = OpenCC('t2s.json')
        self.backend = G2pM()
        self.phonemes = self._get_all_syllables()
        self.punctuations = get_punctuations("cn")

@@ -236,7 +237,8 @@ class Chinese(Phonetics):
        List[str]
            The list of pronunciation sequence.
        """
-        simplified = self.opencc_backend.convert(sentence)
+        # simplified = self.opencc_backend.convert(sentence)
+        simplified = sentence
        phonemes = self.backend(simplified)
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
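The commit handles the Windows install problem by commenting opencc out and passing sentences through unchanged. A guarded import (not what this commit does, just an alternative sketch) would keep traditional-to-simplified conversion wherever opencc installs cleanly:

try:
    from opencc import OpenCC
    _t2s = OpenCC('t2s.json')

    def to_simplified(sentence):
        return _t2s.convert(sentence)
except ImportError:
    def to_simplified(sentence):
        # mirror the commit's fallback: use the input as-is
        return sentence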
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import string

@@ -13,15 +27,8 @@ EN_PUNCT = [
    "!",
]

-CN_PUNCT = [
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    ""
-]
+CN_PUNCT = ["", "", "", "", "", "", ""]

def get_punctuations(lang):
    if lang == "en":

@@ -30,4 +37,3 @@ def get_punctuations(lang):
        return CN_PUNCT
    else:
        raise ValueError(f"language {lang} Not supported")
View File
@@ -575,7 +575,8 @@ class TransformerTTS(nn.Layer):
            decoder_prenet_dropout=config.model.decoder_prenet_dropout,
            dropout=config.model.dropout)
-        iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
+        iteration = checkpoint.load_parameters(
+            model, checkpoint_path=checkpoint_path)
        drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
        reduction_factor = scheduler.StepWise(config.training.reduction_factor)
        model.set_constants(
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
from typing import List, Union, Tuple

@@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo
__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]

def fold(x, n_group):
    r"""Fold audio or spectrogram's temporal dimension into groups.

@@ -31,6 +46,7 @@ def fold(x, n_group):
    new_shape = spatial_shape + [time_steps // n_group, n_group]
    return paddle.reshape(x, new_shape)

class UpsampleNet(nn.LayerList):
    """Layer to upsample mel spectrogram to the same temporal resolution with
    the corresponding waveform.

@@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList):
    ---------
    ``librosa.core.stft``
    """

    def __init__(self, upsample_factors):
        super(UpsampleNet, self).__init__()
        for factor in upsample_factors:

@@ -67,7 +84,9 @@ class UpsampleNet(nn.LayerList):
            init = I.Uniform(-std, std)
            self.append(
                nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1, (3, 2 * factor),
+                    nn.Conv2DTranspose(
+                        1,
+                        1, (3, 2 * factor),
                        padding=(1, factor // 2),
                        stride=(1, factor),
                        weight_attr=init,

@@ -131,15 +150,21 @@ class ResidualBlock(nn.Layer):
    dilations : int
        Dilations of the Convolution2d applied to the input.
    """

    def __init__(self, channels, cond_channels, kernel_size, dilations):
        super(ResidualBlock, self).__init__()
        # input conv
        std = math.sqrt(1 / channels * np.prod(kernel_size))
        init = I.Uniform(-std, std)
-        receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)]
+        receptive_field = [
+            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
+        ]
        rh, rw = receptive_field
        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # causal & same
-        conv = nn.Conv2D(channels, 2 * channels, kernel_size,
+        conv = nn.Conv2D(
+            channels,
+            2 * channels,
+            kernel_size,
            padding=paddings,
            dilation=dilations,
            weight_attr=init,

@@ -152,15 +177,18 @@ class ResidualBlock(nn.Layer):
        # condition projection
        std = math.sqrt(1 / cond_channels)
        init = I.Uniform(-std, std)
-        condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1),
-                                   weight_attr=init, bias_attr=init)
+        condition_proj = nn.Conv2D(
+            cond_channels,
+            2 * channels, (1, 1),
+            weight_attr=init,
+            bias_attr=init)
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        # parametric residual & skip connection
        std = math.sqrt(1 / channels)
        init = I.Uniform(-std, std)
-        out_proj = nn.Conv2D(channels, 2 * channels, (1, 1),
-                             weight_attr=init, bias_attr=init)
+        out_proj = nn.Conv2D(
+            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
        self.out_proj = nn.utils.weight_norm(out_proj)

    def forward(self, x, condition):

@@ -290,6 +318,7 @@ class ResidualNet(nn.LayerList):
    ValueError
        If the length of dilations_h does not equal n_layers.
    """

    def __init__(self,
                 n_layer: int,
                 residual_channels: int,

@@ -297,11 +326,13 @@ class ResidualNet(nn.LayerList):
                 kernel_size: Tuple[int],
                 dilations_h: List[int]):
        if len(dilations_h) != n_layer:
-            raise ValueError("number of dilations_h should equal num of layers")
+            raise ValueError(
+                "number of dilations_h should equal num of layers")
        super(ResidualNet, self).__init__()
        for i in range(n_layer):
-            dilation = (dilations_h[i], 2 ** i)
-            layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation)
+            dilation = (dilations_h[i], 2**i)
+            layer = ResidualBlock(residual_channels, condition_channels,
+                                  kernel_size, dilation)
            self.append(layer)

    def forward(self, x, condition):

@@ -397,7 +428,9 @@ class Flow(nn.Layer):
        super(Flow, self).__init__()
        # input projection
        self.input_proj = nn.utils.weight_norm(
-            nn.Conv2D(1, channels, (1, 1),
+            nn.Conv2D(
+                1,
+                channels, (1, 1),
                weight_attr=I.Uniform(-1., 1.),
                bias_attr=I.Uniform(-1., 1.)))

@@ -406,7 +439,9 @@ class Flow(nn.Layer):
                                self.dilations_dict[n_group])

        # output projection
-        self.output_proj = nn.Conv2D(channels, 2, (1, 1),
+        self.output_proj = nn.Conv2D(
+            channels,
+            2, (1, 1),
            weight_attr=I.Constant(0.),
            bias_attr=I.Constant(0.))

@@ -452,8 +487,8 @@ class Flow(nn.Layer):
            transformation from x to z.
        """
        # (B, C, H-1, W)
-        logs, b = self._predict_parameters(
-            x[:, :, :-1, :], condition[:, :, 1:, :])
+        logs, b = self._predict_parameters(x[:, :, :-1, :],
+                                           condition[:, :, 1:, :])
        z = self._transform(x, logs, b)
        return z, (logs, b)

@@ -511,10 +546,11 @@ class Flow(nn.Layer):
        self._start_sequence()
        for i in range(1, self.n_group):
            x_row = x[-1]  # actually i-1:i
-            z_row = z[:, :, i:i+1, :]
-            condition_row = condition[:, :, i:i+1, :]
-            x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row)
+            z_row = z[:, :, i:i + 1, :]
+            condition_row = condition[:, :, i:i + 1, :]
+            x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
+                                                      condition_row)
            x.append(x_next_row)
            logs_list.append(logs)
            b_list.append(b)

@@ -549,13 +585,17 @@ class WaveFlow(nn.LayerList):
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """

-    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
+    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
+                 kernel_size):
        if n_group % 2 or n_flows % 2:
-            raise ValueError("number of flows and number of group must be even "
+            raise ValueError(
+                "number of flows and number of group must be even "
                "since a permutation along group among flows is used.")
        super(WaveFlow, self).__init__()
        for _ in range(n_flows):
-            self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group))
+            self.append(
+                Flow(n_layers, channels, mel_bands, kernel_size, n_group))

        # permutations in h
        self.perms = self._create_perm(n_group, n_flows)

@@ -572,7 +612,8 @@ class WaveFlow(nn.LayerList):
            if i < n_flows // 2:
                perms.append(indices[::-1])
            else:
-                perm = list(reversed(indices[:half])) + list(reversed(indices[half:]))
+                perm = list(reversed(indices[:half])) + list(
+                    reversed(indices[half:]))
                perms.append(perm)
        return perms

@@ -612,8 +653,10 @@ class WaveFlow(nn.LayerList):
        x, condition = self._trim(x, condition)

        # to (B, C, h, T//h) layout
-        x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        x = paddle.unsqueeze(
+            paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])

        # flows
        logs_list = []

@@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList):
        z, condition = self._trim(z, condition)

        # to (B, C, h, T//h) layout
-        z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
-        condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
+        z = paddle.unsqueeze(
+            paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
+        condition = paddle.transpose(
+            fold(condition, self.n_group), [0, 1, 3, 2])

        # reverse it flow by flow
        for i in reversed(range(self.n_flows)):

@@ -695,6 +740,7 @@ class ConditionalWaveFlow(nn.LayerList):
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_flows: int,

@@ -795,8 +841,7 @@ class ConditionalWaveFlow(nn.LayerList):
        ConditionalWaveFlow
            The model built from pretrained result.
        """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
+        model = cls(upsample_factors=config.model.upsample_factors,
                    n_flows=config.model.n_flows,
                    n_layers=config.model.n_layers,
                    n_group=config.model.n_group,

@@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer):
        The standard deviation of the gaussian noise used in WaveFlow, by
        default 1.0.
    """

    def __init__(self, sigma=1.0):
        super(WaveFlowLoss, self).__init__()
        self.sigma = sigma

@@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer):
        Tensor [shape=(1,)]
            The loss.
        """
-        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
+        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
+                                    ) - log_det_jacobian
        loss = loss / np.prod(z.shape)
        return loss + self.const
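To make the `(B, C, h, T//h)` layout above concrete, here is `fold` on a tiny batch (the last two lines of the function are copied from the hunk; the shape bookkeeping before them is reconstructed):

import paddle

def fold(x, n_group):
    # split the trailing time axis into (time_steps // n_group, n_group)
    spatial_shape = list(x.shape[:-1])
    time_steps = x.shape[-1]
    new_shape = spatial_shape + [time_steps // n_group, n_group]
    return paddle.reshape(x, new_shape)

x = paddle.arange(12, dtype="float32").reshape([1, 12])  # (B, T)
print(fold(x, 4).shape)  # [1, 3, 4]

The `WaveFlowLoss` expression at the end is the Gaussian negative log-likelihood of z minus the log-determinant term, averaged per element; `self.const` is presumably the 0.5 * log(2 * pi * sigma^2) normalizer, though its definition is not shown in this hunk.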
View File
@@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools
__all__ = ["WaveNet", "ConditionalWaveNet"]

def crop(x, audio_start, audio_length):
    """Crop the upsampled condition to match audio_length.

@@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList):
    ---------
    ``librosa.core.stft``
    """

    def __init__(self, upscale_factors=[16, 16]):
        super(UpsampleNet, self).__init__()
        self.upscale_factors = list(upscale_factors)

@@ -106,7 +108,9 @@ class UpsampleNet(nn.LayerList):
        for factor in self.upscale_factors:
            self.append(
                nn.utils.weight_norm(
-                    nn.Conv2DTranspose(1, 1,
+                    nn.Conv2DTranspose(
+                        1,
+                        1,
                        kernel_size=(3, 2 * factor),
                        stride=(1, factor),
                        padding=(1, factor // 2))))

@@ -159,6 +163,7 @@ class ResidualBlock(nn.Layer):
    dilation : int
        Dilation of the internal convolution cells.
    """

    def __init__(self,
                 residual_channels: int,
                 condition_dim: int,

@@ -170,9 +175,11 @@ class ResidualBlock(nn.Layer):
        # following clarinet's implementation, we do not have parametric residual
        # & skip connection.

-        _filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size
+        _filter_size = filter_size[0] if isinstance(filter_size, (
+            list, tuple)) else filter_size
        std = math.sqrt(1 / (_filter_size * residual_channels))
-        conv = Conv1dCell(residual_channels,
+        conv = Conv1dCell(
+            residual_channels,
            dilated_channels,
            filter_size,
            dilation=dilation,

@@ -180,7 +187,9 @@ class ResidualBlock(nn.Layer):
        self.conv = nn.utils.weight_norm(conv)

        std = math.sqrt(1 / condition_dim)
-        condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,),
-                                    weight_attr=I.Normal(scale=std))
+        condition_proj = Conv1dCell(
+            condition_dim,
+            dilated_channels, (1, ),
+            weight_attr=I.Normal(scale=std))
        self.condition_proj = nn.utils.weight_norm(condition_proj)

@@ -309,6 +318,7 @@ class ResidualNet(nn.LayerList):
        Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
    """

    def __init__(self,
                 n_stack: int,
                 n_loop: int,

@@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList):
        dilations = [2**i for i in range(n_loop)] * n_stack
        self.context_size = 1 + sum(dilations)
        for dilation in dilations:
-            self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
+            self.append(
+                ResidualBlock(residual_channels, condition_dim, filter_size,
+                              dilation))

    def forward(self, x, condition=None):
        """Forward pass of ``ResidualNet``.

@@ -426,6 +438,7 @@ class WaveNet(nn.Layer):
        This is only used for computing loss when ``loss_type`` is "mog". If
        the predicted log scale is less than -9.0, it is clipped at -9.0.
    """

    def __init__(self, n_stack, n_loop, residual_channels, output_dim,
                 condition_dim, filter_size, loss_type, log_scale_min):

@@ -437,19 +450,24 @@ class WaveNet(nn.Layer):
        else:
            if (output_dim % 3 != 0):
                raise ValueError(
-                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
+                    "with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".
+                    format(output_dim))

-        self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1)
+        self.embed = nn.utils.weight_norm(
+            nn.Linear(1, residual_channels), dim=1)
        self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
                                  condition_dim, filter_size)
        self.context_size = self.resnet.context_size

        skip_channels = residual_channels  # assume the same channel
-        self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
-        self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj1 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
+        self.proj2 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, skip_channels), dim=1)
        # if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
        # if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
-        self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1)
+        self.proj3 = nn.utils.weight_norm(
+            nn.Linear(skip_channels, output_dim), dim=1)

        self.loss_type = loss_type
        self.output_dim = output_dim

@@ -781,6 +799,7 @@ class ConditionalWaveNet(nn.Layer):
        This is only used for computing loss when ``loss_type`` is "mog". If
        the predicted log scale is less than -9.0, it is clipped at -9.0.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_stack: int,

@@ -793,7 +812,8 @@ class ConditionalWaveNet(nn.Layer):
                 log_scale_min: float=-9.0):
        super(ConditionalWaveNet, self).__init__()
        self.encoder = UpsampleNet(upsample_factors)
-        self.decoder = WaveNet(n_stack=n_stack,
+        self.decoder = WaveNet(
+            n_stack=n_stack,
            n_loop=n_loop,
            residual_channels=residual_channels,
            output_dim=output_dim,

@@ -943,8 +963,7 @@ class ConditionalWaveNet(nn.Layer):
        ConditionalWaveNet
            The model built from pretrained result.
        """
-        model = cls(
-            upsample_factors=config.model.upsample_factors,
+        model = cls(upsample_factors=config.model.upsample_factors,
                    n_stack=config.model.n_stack,
                    n_loop=config.model.n_loop,
                    residual_channels=config.model.residual_channels,
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F

@@ -86,6 +100,7 @@ class STFT(nn.Layer):
        Only ``center`` and ``reflect`` padding is supported now.
    """

    def __init__(self, n_fft, hop_length, win_length, window="hanning"):
        super(STFT, self).__init__()
        self.hop_length = hop_length

@@ -109,7 +124,8 @@ class STFT(nn.Layer):
                      (self.n_bin, 1, 1, self.n_fft))
        w = np.concatenate([w_real, w_imag], axis=0)
-        self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+        self.weight = paddle.cast(
+            paddle.to_tensor(w), paddle.get_default_dtype())

    def forward(self, x):
        """Compute the stft transform.
View File
@@ -20,6 +20,7 @@ __all__ = [
    "Conv1dBatchNorm",
]

class Conv1dCell(nn.Conv1D):
    """A subclass of Conv1D layer, which can be used in an autoregressive
    decoder like an RNN cell.

@@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer):
    epsilon : [type], optional
        The epsilon of the BatchNorm1D layer, by default 1e-05
    """

    def __init__(self,
                 in_channels,
                 out_channels,
View File
@@ -1,6 +1,21 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle

def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along axis given the permutation or randomly.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numba
import numpy as np
import paddle

@@ -11,6 +25,7 @@ __all__ = [
    "diagonal_loss",
]

def weighted_mean(input, weight):
    """Weighted mean. It can also be used as masked mean.

@@ -88,8 +103,7 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
    return loss

-def diagonal_loss(
-        attentions,
+def diagonal_loss(attentions,
                  input_lengths,
                  target_lengths,
                  g=0.2,

@@ -133,6 +147,7 @@ def diagonal_loss(
    else:
        return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))

@numba.jit(nopython=True)
def guided_attention(N, max_N, T, max_T, g):
    W = np.zeros((max_T, max_N), dtype=np.float32)

@@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g):
    # (T_dec, T_enc)
    return W

def guided_attentions(input_lengths, target_lengths, g=0.2):
    B = len(input_lengths)
    max_input_len = input_lengths.max()
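Only the frame of the numba-jitted `guided_attention` is visible in this hunk; the standard soft-diagonal weight it presumably fills in (the guided-attention penalty of Tachibana et al., 2017) looks like this:

import numpy as np

def guided_attention_sketch(N, max_N, T, max_T, g=0.2):
    W = np.zeros((max_T, max_N), dtype=np.float32)
    for t in range(T):
        for n in range(N):
            # near zero on the diagonal n/N == t/T, approaching 1 off it
            W[t, n] = 1.0 - np.exp(-((n / N - t / T)**2) / (2 * g * g))
    return W  # (T_dec, T_enc)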
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid.layers import sequence_mask

@@ -8,6 +22,7 @@ __all__ = [
    "future_mask",
]

def id_mask(input, padding_index=0, dtype="bool"):
    """Generate mask with input ids.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle

@@ -5,6 +19,7 @@ from paddle.nn import functional as F
__all__ = ["positional_encoding"]

def positional_encoding(start_index, length, size, dtype=None):
    r"""Generate standard positional encoding matrix.

@@ -37,7 +52,7 @@ def positional_encoding(start_index, length, size, dtype=None):
    dtype = dtype or paddle.get_default_dtype()
    channel = np.arange(0, size, 2)
    index = np.arange(start_index, start_index + length, 1)
-    p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
+    p = np.expand_dims(index, -1) / (10000**(channel / float(size)))
    encodings = np.zeros([length, size])
    encodings[:, 0::2] = np.sin(p)
    encodings[:, 1::2] = np.cos(p)
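In formula form, the lines above compute the standard sinusoidal encoding, with `p` holding the phase shared by each sine/cosine channel pair:

PE(pos, 2i)   = \sin\big(pos / 10000^{2i/d}\big)
PE(pos, 2i+1) = \cos\big(pos / 10000^{2i/d}\big)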
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn

@@ -12,6 +26,7 @@ __all__ = [
    "TransformerDecoderLayer",
]

class PositionwiseFFN(nn.Layer):
    """A faithful implementation of Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

@@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer):
        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """
-    def __init__(self,
-                 input_size: int,
-                 hidden_size: int,
-                 dropout=0.0):
+
+    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)

@@ -86,6 +99,7 @@ class TransformerEncoderLayer(nn.Layer):
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerEncoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)

@@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer):
        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
-            F.dropout(x + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
-            F.dropout(x + self.ffn(x),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights

@@ -155,6 +167,7 @@ class TransformerDecoderLayer(nn.Layer):
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerDecoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)

@@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer):
        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
-        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
+        context_vector, self_attn_weights = self.self_mha(q, q, q,
+                                                          decoder_mask)
        q = self.layer_norm1(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))

-        context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
+        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
+                                                            encoder_mask)
        q = self.layer_norm2(
-            F.dropout(q + context_vector,
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
-            F.dropout(q + self.ffn(q),
-                      self.dropout,
-                      training=self.training))
+            F.dropout(
+                q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
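Both layer types follow the PostLN pattern the reflowed calls make visible: every sublayer output is dropped out, residually added, then normalized,

x \leftarrow \mathrm{LayerNorm}\big(x + \mathrm{Dropout}(\mathrm{Sublayer}(x))\big)

applied once per attention block and once for the feed-forward network.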
View File
@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.training.cli import *
from parakeet.training.experiment import *
View File
@@ -1,5 +1,20 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

def default_argument_parser():
    r"""A simple yet general argument parser for experiments with parakeet.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode

_C = CfgNode(

@@ -5,8 +19,8 @@ _C = CfgNode(
        valid_interval=1000,  # validation
        save_interval=10000,  # checkpoint
        max_iteration=900000,  # max iteration to train
-    )
-)
+    ))

def get_default_training_config():
    return _C.clone()
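Because `get_default_training_config` returns a clone, experiments can override fields without touching the module-level `_C`. A small usage sketch with the stock yacs API:

cfg = get_default_training_config()
cfg.merge_from_list(["max_iteration", 500000])
assert cfg.max_iteration == 500000
assert cfg.valid_interval == 1000  # other defaults untouched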
View File
@@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools
__all__ = ["ExperimentBase"]

class ExperimentBase(object):
    """
    An experiment template in order to structure the training code and take
View File
@@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
    return iteration

def _save_checkpoint(checkpoint_dir: str, iteration: int):
    """Save the iteration number of the latest model to be checkpointed.

@@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
    with open(checkpoint_record, "wt") as handle:
        handle.write("model_checkpoint_path: step-{}".format(iteration))

def load_parameters(model,
                    optimizer=None,
                    checkpoint_dir=None,

@@ -97,18 +99,19 @@ def load_parameters(model,
    params_path = checkpoint_path + ".pdparams"
    model_dict = paddle.load(params_path)
    model.set_state_dict(model_dict)
-    print("[checkpoint] Rank {}: loaded model from {}".format(
-        local_rank, params_path))
+    print("[checkpoint] Rank {}: loaded model from {}".format(local_rank,
+                                                              params_path))

    optimizer_path = checkpoint_path + ".pdopt"
    if optimizer and os.path.isfile(optimizer_path):
        optimizer_dict = paddle.load(optimizer_path)
        optimizer.set_state_dict(optimizer_dict)
-        print("[checkpoint] Rank {}: loaded optimizer state from {}".
-              format(local_rank, optimizer_path))
+        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
+            local_rank, optimizer_path))

    return iteration

@mp_tools.rank_zero_only
def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.framework import core
View File
@@ -28,6 +28,7 @@ def summary(layer: nn.Layer):
    print("layer has {} parameters, {} elements.".format(num_params,
                                                         num_elements))

def gradient_norm(layer: nn.Layer):
    grad_norm_dict = {}
    for name, param in layer.state_dict().items():

@@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer):
            grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
    return grad_norm_dict

def recursively_remove_weight_norm(layer: nn.Layer):
    for layer in layer.sublayers():
        try:

@@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer):
            # there is no weight norm hook in this layer
            pass

def freeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = False

def unfreeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = True
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import distributed as dist
from functools import wraps

@@ -16,6 +30,3 @@ def rank_zero_only(func):
        return result

    return wrapper
View File
@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

__all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"]

@@ -34,8 +48,8 @@ class PieceWise(SchedulerBase):
            return self.ys[0]
        if i == self.num_anchors:
            return self.ys[-1]
-        k = (self.ys[i] - self.ys[i-1]) / (self.xs[i] - self.xs[i-1])
-        out = self.ys[i-1] + (step - self.xs[i-1]) * k
+        k = (self.ys[i] - self.ys[i - 1]) / (self.xs[i] - self.xs[i - 1])
+        out = self.ys[i - 1] + (step - self.xs[i - 1]) * k
        return out

@@ -58,5 +72,4 @@ class StepWise(SchedulerBase):
            return self.ys[-1]
        if i == 0:
            return self.ys[0]
-        return self.ys[i-1]
+        return self.ys[i - 1]
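The `PieceWise` lookup above is linear interpolation between `(xs, ys)` anchors, clamped at both ends. A standalone equivalent (assuming the anchor index comes from a right bisect, which the boundary checks suggest):

import bisect

def piecewise(xs, ys, step):
    i = bisect.bisect_right(xs, step)
    if i == 0:
        return ys[0]
    if i == len(xs):
        return ys[-1]  # clamp past the last anchor
    k = (ys[i] - ys[i - 1]) / (xs[i] - xs[i - 1])
    return ys[i - 1] + (step - xs[i - 1]) * k

xs, ys = [0, 10000, 20000], [0.0, 1.0, 0.5]
print(piecewise(xs, ys, 5000))   # 0.5, halfway along the first segment
print(piecewise(xs, ys, 30000))  # 0.5, clamped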
View File
@@ -48,7 +48,6 @@ setup_info = dict(
    description='Speech synthesis tools and models based on Paddlepaddle',
    long_description=long_description,
    license='Apache 2',
-
    python_requires='>=3.6',
    install_requires=[
        'numpy',

@@ -64,30 +63,25 @@ setup_info = dict(
        'scipy',
        'pandas',
        'sox',
-        'opencc',
+        # 'opencc',
        'soundfile',
        'g2p_en',
        'g2pM',
        'yacs',
        'tensorboardX',
    ],
-    extras_require={
-        'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
-    },
+    extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },

    # Package info
    packages=find_packages(exclude=('tests', 'tests.*')),
    zip_safe=True,
-
-    classifiers = [
+    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence'
        'License :: OSI Approved :: Apache2 License',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
-    ],
-)
+    ], )

setup(**setup_info)
View File
@@ -1,101 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())

from parakeet.modules import attention as attn

class TestScaledDotProductAttention(unittest.TestCase):
    def test_without_mask(self):
        x = paddle.randn([4, 16, 8])
        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
        assert (list(context_vector.shape) == [4, 16, 8])
        assert (list(attention_weights.shape) == [4, 16, 16])

    def test_with_mask(self):
        x = paddle.randn([4, 16, 8])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([16, 15, 13, 14]), dtype=x.dtype)
        mask = mask.unsqueeze(1)  # unsqueeze for the decoder time steps
        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x, mask)
        assert (list(context_vector.shape) == [4, 16, 8])
        assert (list(attention_weights.shape) == [4, 16, 16])

    def test_4d(self):
        x = paddle.randn([4, 6, 16, 8])
        context_vector, attention_weights = attn.scaled_dot_product_attention(x, x, x)
        assert (list(context_vector.shape) == [4, 6, 16, 8])
        assert (list(attention_weights.shape) == [4, 6, 16, 16])

class TestMonoheadAttention(unittest.TestCase):
    def test_io(self):
        net = attn.MonoheadAttention(6, 0.1)
        q = paddle.randn([4, 18, 6])
        k = paddle.randn([4, 12, 6])
        v = paddle.randn([4, 12, 6])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
        context_vector, attn_weights = net(q, k, v, mask)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 18, 6))
        self.assertTupleEqual(attn_weights.numpy().shape, (4, 18, 12))

class TestDropHead(unittest.TestCase):
    def test_drop(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 2, training=True)
        # drop 2 heads from 6 at all positions
        np.testing.assert_allclose(np.sum(out.numpy() == 0., axis=1), 2)

    def test_drop_all(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 6, training=True)
        np.testing.assert_allclose(np.sum(out.numpy()), 0)

    def test_eval(self):
        x = paddle.randn([4, 6, 16, 8])
        out = attn.drop_head(x, 6, training=False)
        self.assertIs(x, out)

class TestMultiheadAttention(unittest.TestCase):
    def __init__(self, methodName="test_io", same_qk=True):
        super(TestMultiheadAttention, self).__init__(methodName)
        self.same_qk = same_qk

    def setUp(self):
        if self.same_qk:
            net = attn.MultiheadAttention(64, 8, dropout=0.3)
        else:
            net = attn.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
        self.net = net

    def test_io(self):
        q = paddle.randn([4, 12, 64])
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for time_steps_q
        context_vector, attention_weights = self.net(q, q, q, mask)
        self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
        self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))

def load_tests(loader, standard_tests, pattern):
    suite = unittest.TestSuite()
    suite.addTest(TestScaledDotProductAttention("test_without_mask"))
    suite.addTest(TestScaledDotProductAttention("test_with_mask"))
    suite.addTest(TestScaledDotProductAttention("test_4d"))
    suite.addTest(TestDropHead("test_drop"))
    suite.addTest(TestDropHead("test_drop_all"))
    suite.addTest(TestDropHead("test_eval"))
    suite.addTest(TestMonoheadAttention("test_io"))
    suite.addTest(TestMultiheadAttention("test_io", same_qk=True))
    suite.addTest(TestMultiheadAttention("test_io", same_qk=False))
    return suite
View File
@ -1,34 +0,0 @@
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import cbhg
class TestHighway(unittest.TestCase):
def test_io(self):
net = cbhg.Highway(4)
x = paddle.randn([2, 12, 4])
y = net(x)
self.assertTupleEqual(y.numpy().shape, (2, 12, 4))
class TestCBHG(unittest.TestCase):
def __init__(self, methodName="runTest", ):
super(TestCBHG, self).__init__(methodName)
def test_io(self):
self.net = cbhg.CBHG(64, 32, 16,
projection_channels=[64, 128],
num_highways=4, highway_features=128,
gru_features=64)
x = paddle.randn([4, 64, 32])
y = self.net(x)
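# the input is channel-first (4, 64, 32) while the output is time-major
# (4, 32, 128), where 128 = 2 * gru_features from the bidirectional GRU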
self.assertTupleEqual(y.numpy().shape, (4, 32, 128))
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(TestHighway("test_io"))
suite.addTest(TestCBHG("test_io"))
return suite

View File

@ -1,43 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import clarinet
from parakeet.modules import stft
class TestParallelWaveNet(unittest.TestCase):
def test_io(self):
net = clarinet.ParallelWaveNet([8, 8, 8], [1, 1, 1], 16, 12, 2)
x = paddle.randn([4, 6073])
condition = paddle.randn([4, 12, 6073])
z, out_mu, out_log_std = net(x, condition)
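# the student is an inverse autoregressive flow: it maps the input samples to
# a latent z plus per-sample Gaussian parameters, all at the input resolution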
self.assertTupleEqual(z.numpy().shape, (4, 6073))
self.assertTupleEqual(out_mu.numpy().shape, (4, 6073))
self.assertTupleEqual(out_log_std.numpy().shape, (4, 6073))
class TestClariNet(unittest.TestCase):
def setUp(self):
encoder = clarinet.UpsampleNet([2, 2])
teacher = clarinet.WaveNet(8, 3, 16, 3, 12, 2, "mog", -9.0)
student = clarinet.ParallelWaveNet([8, 8, 8, 8, 8, 8], [1, 1, 1, 1, 1, 1], 16, 12, 2)
stft_module = stft.STFT(16, 4, 8)
net = clarinet.Clarinet(encoder, teacher, student, stft_module, -6.0, lmd=4)
print("context size is: ", teacher.context_size)
self.net = net
def test_io(self):
audio = paddle.randn([4, 1366])
mel = paddle.randn([4, 12, 512]) # 512 frames * 4x upsampling = 2048 samples
audio_start = paddle.zeros([4], dtype="int64")
loss = self.net(audio, mel, audio_start, clip_kl=True)
loss["loss"].numpy()
def test_synthesis(self):
mel = paddle.randn([4, 12, 512]) # 512 frames * 4x upsampling = 2048 samples
out = self.net.synthesis(mel)
self.assertTupleEqual(out.numpy().shape, (4, 2048))

View File

@ -1,33 +0,0 @@
import unittest
import paddle
from paddle import nn
paddle.disable_static(paddle.CPUPlace())
paddle.set_default_dtype("float64")
from parakeet.modules import connections as conn
class TestPreLayerNormWrapper(unittest.TestCase):
def test_io(self):
net = nn.Linear(8, 8)
net = conn.PreLayerNormWrapper(net, 8)
x = paddle.randn([4, 8])
y = net(x)
self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
class TestPostLayerNormWrapper(unittest.TestCase):
def test_io(self):
net = nn.Linear(8, 8)
net = conn.PostLayerNormWrapper(net, 8)
x = paddle.randn([4, 8])
y = net(x)
self.assertTupleEqual(x.numpy().shape, y.numpy().shape)
class TestResidualWrapper(unittest.TestCase):
def test_io(self):
net = nn.Linear(8, 8)
net = conn.ResidualWrapper(net)
x = paddle.randn([4, 8])
y = net(x)
self.assertTupleEqual(x.numpy().shape, y.numpy().shape)

View File

@ -1,67 +0,0 @@
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
import unittest
import numpy as np
from parakeet.modules import conv
class TestConv1dCell(unittest.TestCase):
def setUp(self):
self.net = conv.Conv1dCell(4, 6, 5, dilation=2)
def forward_incremental(self, x):
outs = []
self.net.start_sequence()
with paddle.no_grad():
for i in range(x.shape[-1]):
xt = x[:, :, i]
yt = self.net.add_input(xt)
outs.append(yt)
y2 = paddle.stack(outs, axis=-1)
return y2
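# the cell caches past inputs, so feeding one step at a time through add_input
# should reproduce the parallel forward; test_equality below checks exactly that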
def test_equality(self):
x = paddle.randn([2, 4, 16])
y1 = self.net(x)
self.net.eval()
y2 = self.forward_incremental(x)
np.testing.assert_allclose(y2.numpy(), y1.numpy())
class TestConv1dBatchNorm(unittest.TestCase):
def __init__(self, methodName="runTest", causal=False, channel_last=False):
super(TestConv1dBatchNorm, self).__init__(methodName)
self.causal = causal
self.channel_last = channel_last
def setUp(self):
k = 5
padding = (k - 1, 0) if self.causal else ((k - 1) // 2, k // 2)
self.net = conv.Conv1dBatchNorm(4, 6, (k,), 1, padding=padding,
data_format="NLC" if self.channel_last else "NCL")
def test_input_output(self):
x = paddle.randn([4, 16, 4]) if self.channel_last else paddle.randn([4, 4, 16])
out = self.net(x)
out_np = out.numpy()
if self.channel_last:
self.assertTupleEqual(out_np.shape, (4, 16, 6))
else:
self.assertTupleEqual(out_np.shape, (4, 6, 16))
def runTest(self):
self.test_input_output()
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(TestConv1dBatchNorm("runTest", True, True))
suite.addTest(TestConv1dBatchNorm("runTest", False, False))
suite.addTest(TestConv1dBatchNorm("runTest", True, False))
suite.addTest(TestConv1dBatchNorm("runTest", False, True))
suite.addTest(TestConv1dCell("test_equality"))
return suite

View File

@ -1,122 +0,0 @@
import unittest
import numpy as np
import paddle
from paddle import io
from parakeet import data
class MyDataset(io.Dataset):
def __init__(self, size):
self._data = np.random.randn(size, 6)
def __getitem__(self, i):
return self._data[i]
def __len__(self):
return self._data.shape[0]
class TestTransformDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(20)
dataset = data.TransformDataset(dataset, lambda x: np.abs(x))
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("TransformDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestChainDataset(unittest.TestCase):
def test(self):
dataset1 = MyDataset(20)
dataset2 = MyDataset(40)
dataset = data.ChainDataset(dataset1, dataset2)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("ChainDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestTupleDataset(unittest.TestCase):
def test(self):
dataset1 = MyDataset(20)
dataset2 = MyDataset(20)
dataset = data.TupleDataset(dataset1, dataset2)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("TupleDataset")
for field1, field2 in dataloader:
print(type(field1), field1.dtype, field1.shape)
print(type(field2), field2.dtype, field2.shape)
class TestDictDataset(unittest.TestCase):
def test(self):
dataset1 = MyDataset(20)
dataset2 = MyDataset(20)
dataset = data.DictDataset(field1=dataset1, field2=dataset2)
def collate_fn(examples):
examples_tuples = []
for example in examples:
examples_tuples.append(example.values())
return paddle.fluid.dataloader.dataloader_iter.default_collate_fn(examples_tuples)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1, collate_fn=collate_fn)
print("DictDataset")
for field1, field2 in dataloader:
print(type(field1), field1.dtype, field1.shape)
print(type(field2), field2.dtype, field2.shape)
class TestSliceDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
dataset = data.SliceDataset(dataset, 0, 20)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("SliceDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestSplit(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
train, valid = data.split(dataset, 10)
dataloader1 = io.DataLoader(train, batch_size=4, shuffle=True, num_workers=1)
dataloader2 = io.DataLoader(valid, batch_size=4, shuffle=True, num_workers=1)
print("First Dataset")
for batch, in dataloader1:
print(type(batch), batch.dtype, batch.shape)
print("Second Dataset")
for batch, in dataloader2:
print(type(batch), batch.dtype, batch.shape)
class TestSubsetDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
indices = np.random.choice(np.arange(40), [20], replace=False).tolist()
dataset = data.SubsetDataset(dataset, indices)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("SubsetDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestFilterDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
dataset = data.FilterDataset(dataset, lambda x: np.mean(x) > 0.3)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("FilterDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)
class TestCacheDataset(unittest.TestCase):
def test(self):
dataset = MyDataset(40)
dataset = data.CacheDataset(dataset)
dataloader = io.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
print("CacheDataset")
for batch, in dataloader:
print(type(batch), batch.dtype, batch.shape)

View File

@ -1,107 +0,0 @@
import numpy as np
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import deepvoice3 as dv3
class TestConvBlock(unittest.TestCase):
def test_io_causal(self):
net = dv3.ConvBlock(6, 5, True, True, 8, 0.9)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
# TODO(chenfeiyu): report an issue about the default data type
padding = paddle.zeros([4, 4, 6], dtype=x.dtype)
y = net.forward(x, condition, padding)
self.assertTupleEqual(y.numpy().shape, (4, 32, 6))
def test_io_non_causal(self):
net = dv3.ConvBlock(6, 5, False, True, 8, 0.9)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
y = net.forward(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32, 6))
class TestAffineBlock1(unittest.TestCase):
def test_io(self):
net = dv3.AffineBlock1(6, 16, True, 8)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32, 16))
class TestAffineBlock2(unittest.TestCase):
def test_io(self):
net = dv3.AffineBlock2(6, 16, True, 8)
x = paddle.randn([4, 32, 6])
condition = paddle.randn([4, 8])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32, 16))
class TestEncoder(unittest.TestCase):
def test_io(self):
net = dv3.Encoder(5, 8, 16, 5, True, 6)
x = paddle.randn([4, 32, 8])
condition = paddle.randn([4, 6])
keys, values = net(x, condition)
self.assertTupleEqual(keys.numpy().shape, (4, 32, 8))
self.assertTupleEqual(values.numpy().shape, (4, 32, 8))
class TestAttentionBlock(unittest.TestCase):
def test_io(self):
net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
q = paddle.randn([4, 32, 6])
k = paddle.randn([4, 24, 6])
v = paddle.randn([4, 24, 6])
lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
condition = paddle.randn([4, 8])
context_vector, attention_weight = net(q, k, v, lengths, condition, 0)
self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))
def test_io_with_previous_attn(self):
net = dv3.AttentionBlock(16, 6, has_bias=True, bias_dim=8)
q = paddle.randn([4, 32, 6])
k = paddle.randn([4, 24, 6])
v = paddle.randn([4, 24, 6])
lengths = paddle.to_tensor([24, 20, 19, 23], dtype="int64")
condition = paddle.randn([4, 8])
prev_attn_weight = paddle.randn([4, 32, 16])
context_vector, attention_weight = net(
q, k, v, lengths, condition, 0,
force_monotonic=True, prev_coeffs=prev_attn_weight, window=(0, 4))
self.assertTupleEqual(context_vector.numpy().shape, (4, 32, 6))
self.assertTupleEqual(attention_weight.numpy().shape, (4, 32, 24))
class TestDecoder(unittest.TestCase):
def test_io(self):
net = dv3.Decoder(8, 4, [4, 12], 5, 3, 16, 1.0, 1.45, True, 6)
x = paddle.randn([4, 32, 8])
k = paddle.randn([4, 24, 12]) # the prenet's last size should equal k's feature size
v = paddle.randn([4, 24, 12])
lengths = paddle.to_tensor([24, 18, 19, 22])
condition = paddle.randn([4, 6])
decoded, hidden, attentions, final_state = net(x, k, v, lengths, 0, condition)
self.assertTupleEqual(decoded.numpy().shape, (4, 32, 4 * 8))
self.assertTupleEqual(hidden.numpy().shape, (4, 32, 12))
self.assertEqual(len(attentions), 5)
self.assertTupleEqual(attentions[0].numpy().shape, (4, 32, 24))
self.assertEqual(len(final_state), 5)
self.assertTupleEqual(final_state[0].numpy().shape, (4, 2, 12))
class TestPostNet(unittest.TestCase):
def test_io(self):
net = dv3.PostNet(3, 8, 16, 3, 12, 4, True, 6)
x = paddle.randn([4, 32, 8])
condition = paddle.randn([4, 6])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 32 * 4, 12))

View File

@ -1,19 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import geometry as geo
class TestShuffleDim(unittest.TestCase):
def test_perm(self):
x = paddle.randn([2, 3, 4, 6])
y = geo.shuffle_dim(x, 2, [3, 2, 1, 0])
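# the permutation [3, 2, 1, 0] reverses axis 2, hence the comparison with the [::-1] slice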
np.testing.assert_allclose(x.numpy()[0, 0, :, 0], y.numpy()[0, 0, ::-1, 0])
def test_random_perm(self):
x = paddle.randn([2, 3, 4, 6])
y = geo.shuffle_dim(x, 2)
np.testing.assert_allclose(x.numpy().sum(2), y.numpy().sum(2))

View File

@ -1,33 +0,0 @@
import unittest
import paddle
paddle.set_device("cpu")
import numpy as np
from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy
class TestWeightedMean(unittest.TestCase):
def test(self):
x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
loss = weighted_mean(x, mask)
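# rows 0..4 are masked out, so the mean runs over rows 5..9: (5+6+7+8+9)/5 = 7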
self.assertAlmostEqual(loss.numpy()[0], 7)
class TestMaskedL1Loss(unittest.TestCase):
def test(self):
x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
y = paddle.zeros_like(x)
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
loss = masked_l1_loss(x, y, mask)
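# y is all zeros, so |x - y| = x and the masked mean is again (5+6+7+8+9)/5 = 7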
print(loss)
self.assertAlmostEqual(loss.numpy()[0], 7)
class TestMaskedCrossEntropy(unittest.TestCase):
def test(self):
x = paddle.randn([3, 30, 8], dtype="float64")
y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1) # note the trailing axis for the labels
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
loss = masked_softmax_with_cross_entropy(x, y, mask)
print(loss)

View File

@ -1,54 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
from parakeet.modules import masking
def sequence_mask(lengths, max_length=None, dtype="bool"):
max_length = max_length or np.max(lengths)
ids = np.arange(max_length)
return (ids < np.expand_dims(lengths, -1)).astype(dtype)
def future_mask(lengths, max_length=None, dtype="bool"):
max_length = max_length or np.max(lengths)
return np.tril(np.ones(max_length)).astype(dtype)
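# future_mask returns a lower-triangular matrix: step t may attend only to steps <= t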
class TestIDMask(unittest.TestCase):
def test(self):
ids = paddle.to_tensor(
[[1, 2, 3, 0, 0, 0],
[2, 4, 5, 6, 0, 0],
[7, 8, 9, 0, 0, 0]]
)
mask = masking.id_mask(ids)
self.assertTupleEqual(mask.numpy().shape, ids.numpy().shape)
print(mask.numpy())
class TestFeatureMask(unittest.TestCase):
def test(self):
features = np.random.randn(3, 16, 8)
lengths = [16, 14, 12]
for i, length in enumerate(lengths):
features[i, length:, :] = 0
feature_tensor = paddle.to_tensor(features)
mask = masking.feature_mask(feature_tensor, -1)
self.assertTupleEqual(mask.numpy().shape, (3, 16, 1))
print(mask.numpy().squeeze())
class TestCombineMask(unittest.TestCase):
def test_bool_mask(self):
lengths = np.array([12, 8, 9, 10])
padding_mask = sequence_mask(lengths, dtype="bool")
no_future_mask = future_mask(lengths, dtype="bool")
combined_mask1 = np.expand_dims(padding_mask, 1) * no_future_mask
print(paddle.to_tensor(padding_mask).dtype)
print(paddle.to_tensor(no_future_mask).dtype)
combined_mask2 = masking.combine_mask(
paddle.to_tensor(padding_mask).unsqueeze(1), paddle.to_tensor(no_future_mask)
)
np.testing.assert_allclose(combined_mask2.numpy(), combined_mask1)

View File

@ -1,64 +0,0 @@
import unittest
import numpy as np
import paddle
from parakeet.modules import positional_encoding as pe
def positional_encoding(start_index, length, size, dtype="float32"):
if size % 2 != 0:
raise ValueError("size should be divisible by 2")
channel = np.arange(0, size, 2, dtype=dtype)
index = np.arange(start_index, start_index + length, 1, dtype=dtype)
p = np.expand_dims(index, -1) / (10000 ** (channel / float(size)))
encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
return encodings
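# the standard sinusoidal encoding: channel c of the first half holds
# sin(pos / 10000^(c/size)) and the second half the matching cos; note that
# the sin and cos halves are concatenated here rather than interleaved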
def scalable_positional_encoding(start_index, length, size, omega):
dtype = omega.dtype
index = np.arange(start_index, start_index + length, 1, dtype=dtype)
channel = np.arange(0, size, 2, dtype=dtype)
p = np.reshape(omega, omega.shape + (1, 1)) \
* np.expand_dims(index, -1) \
/ (10000 ** (channel / float(size)))
encodings = np.concatenate([np.sin(p), np.cos(p)], axis=-1)
return encodings
class TestPositionEncoding(unittest.TestCase):
def __init__(self, start=0, length=20, size=16, dtype="float64"):
super(TestPositionEncoding, self).__init__("runTest")
self.spec = (start, length, size, dtype)
def test_equality(self):
start, length, size, dtype = self.spec
position_embed1 = positional_encoding(start, length, size, dtype)
position_embed2 = pe.positional_encoding(start, length, size, dtype)
np.testing.assert_allclose(position_embed2.numpy(), position_embed1)
def runTest(self):
paddle.disable_static(paddle.CPUPlace())
self.test_equality()
class TestScalablePositionEncoding(unittest.TestCase):
def __init__(self, start=0, length=20, size=16, dtype="float64"):
super(TestScalablePositionEncoding, self).__init__("runTest")
self.spec = (start, length, size, dtype)
def test_equality(self):
start, length, size, dtype = self.spec
omega = np.random.uniform(1, 2, size=(4,)).astype(dtype)
position_embed1 = scalable_positional_encoding(start, length, size, omega)
position_embed2 = pe.scalable_positional_encoding(start, length, size, paddle.to_tensor(omega))
np.testing.assert_allclose(position_embed2.numpy(), position_embed1)
def runTest(self):
paddle.disable_static(paddle.CPUPlace())
self.test_equality()
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(TestPositionEncoding(0, 20, 16, "float64"))
suite.addTest(TestScalablePositionEncoding(0, 20, 16))
return suite

View File

@ -1,27 +0,0 @@
import unittest
import numpy as np
import librosa
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import stft
class TestSTFT(unittest.TestCase):
def test(self):
path = librosa.util.example("choice")
wav, sr = librosa.load(path, duration=5)
wav = wav.astype("float64")
spec = librosa.stft(wav, n_fft=2048, hop_length=256, win_length=1024)
mag1 = np.abs(spec)
wav_in_batch = paddle.unsqueeze(paddle.to_tensor(wav), 0)
mag2 = stft.STFT(2048, 256, 1024).magnitude(wav_in_batch)
mag2 = paddle.squeeze(mag2, [0, 2]).numpy()
print("mag1", mag1)
print("mag2", mag2)
# TODO(chenfeiyu): some elements do not match exactly; this may be due to
# differing padding/centering conventions between librosa and this STFT
# np.testing.assert_allclose(mag2, mag1)

View File

@ -1,43 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.modules import transformer
class TestPositionwiseFFN(unittest.TestCase):
def test_io(self):
net = transformer.PositionwiseFFN(8, 12)
x = paddle.randn([2, 3, 4, 8])
y = net(x)
self.assertTupleEqual(y.numpy().shape, (2, 3, 4, 8))
class TestTransformerEncoderLayer(unittest.TestCase):
def test_io(self):
net = transformer.TransformerEncoderLayer(64, 8, 128, 0.5)
x = paddle.randn([4, 12, 64])
lengths = paddle.to_tensor([12, 8, 9, 10])
mask = paddle.fluid.layers.sequence_mask(lengths, dtype=x.dtype)
y, attn_weights = net(x, mask)
self.assertTupleEqual(y.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attn_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerDecoderLayer(unittest.TestCase):
def test_io(self):
net = transformer.TransformerDecoderLayer(64, 8, 128, 0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_lengths = paddle.to_tensor([24, 18, 20, 22])
dec_lengths = paddle.to_tensor([32, 28, 30, 31])
enc_mask = paddle.fluid.layers.sequence_mask(enc_lengths, dtype=k.dtype)
dec_mask = paddle.fluid.layers.sequence_mask(dec_lengths, dtype=q.dtype)
y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))

View File

@ -1,121 +0,0 @@
import unittest
import numpy as np
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import transformer_tts as tts
from parakeet.modules import masking
from pprint import pprint
class TestMultiheadAttention(unittest.TestCase):
def test_io_same_qk(self):
net = tts.MultiheadAttention(64, 8)
q = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q
context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
def test_io(self):
net = tts.MultiheadAttention(64, 8, k_dim=12, v_dim=6)
q = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=q.dtype)
mask = paddle.unsqueeze(mask, 1) # unsqueeze for time_steps_q
context_vector, attention_weights = net(q, q, q, mask, drop_n_heads=2)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerEncoderLayer(unittest.TestCase):
def test_io(self):
net = tts.TransformerEncoderLayer(64, 8, 128)
x = paddle.randn([4, 12, 64])
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([12, 10, 8, 9]), dtype=x.dtype)
context_vector, attention_weights = net(x, mask)
self.assertTupleEqual(context_vector.numpy().shape, (4, 12, 64))
self.assertTupleEqual(attention_weights.numpy().shape, (4, 8, 12, 12))
class TestTransformerDecoderLayer(unittest.TestCase):
def test_io(self):
net = tts.TransformerDecoderLayer(64, 8, 128, 0.5)
q = paddle.randn([4, 32, 64])
k = paddle.randn([4, 24, 64])
v = paddle.randn([4, 24, 64])
enc_lengths = paddle.to_tensor([24, 18, 20, 22])
dec_lengths = paddle.to_tensor([32, 28, 30, 31])
enc_mask = masking.sequence_mask(enc_lengths, dtype=k.dtype)
dec_padding_mask = masking.sequence_mask(dec_lengths, dtype=q.dtype)
no_future_mask = masking.future_mask(32, dtype=q.dtype)
dec_mask = masking.combine_mask(dec_padding_mask.unsqueeze(-1), no_future_mask)
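# combining the padding mask with the lower-triangular future mask ensures
# each decoder step attends only to valid, non-future positions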
y, self_attn_weights, cross_attn_weights = net(q, k, v, enc_mask, dec_mask)
self.assertTupleEqual(y.numpy().shape, (4, 32, 64))
self.assertTupleEqual(self_attn_weights.numpy().shape, (4, 8, 32, 32))
self.assertTupleEqual(cross_attn_weights.numpy().shape, (4, 8, 32, 24))
class TestTransformerTTS(unittest.TestCase):
def setUp(self):
net = tts.TransformerTTS(
128, 0, 64, 128, 80, 4, 128,
6, 6, 128, 128, 4,
3, 10, 0.1)
self.net = net
def test_encode_io(self):
net = self.net
text = paddle.randint(0, 128, [4, 176])
lengths = paddle.to_tensor([176, 156, 174, 168])
mask = masking.sequence_mask(lengths, dtype=text.dtype)
text = text * mask
encoded, attention_weights, encoder_mask = net.encode(text)
print("output shapes:")
print("encoded:", encoded.numpy().shape)
print("encoder_attentions:", [item.shape for item in attention_weights])
print("encoder_mask:", encoder_mask.numpy().shape)
def test_all_io(self):
net = self.net
text = paddle.randint(0, 128, [4, 176])
lengths = paddle.to_tensor([176, 156, 174, 168])
mask = masking.sequence_mask(lengths, dtype=text.dtype)
text = text * mask
mel = paddle.randn([4, 189, 80])
frames = paddle.to_tensor([189, 186, 179, 174])
mask = masking.sequence_mask(frames, dtype=frames.dtype)
mel = mel * mask.unsqueeze(-1)
encoded, encoder_attention_weights, encoder_mask = net.encode(text)
mel_output, mel_intermediate, cross_attention_weights, stop_logits = net.decode(encoded, mel, encoder_mask)
print("output shapes:")
print("encoder_output:", encoded.numpy().shape)
print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
print("encoder_mask:", encoder_mask.numpy().shape)
print("mel_output: ", mel_output.numpy().shape)
print("mel_intermediate: ", mel_intermediate.numpy().shape)
print("decoder_attentions:", [item.shape for item in cross_attention_weights])
print("stop_logits:", stop_logits.numpy().shape)
def test_predict_io(self):
net = self.net
net.eval()
with paddle.no_grad():
text = paddle.randint(0, 128, [176])
decoder_output, encoder_attention_weights, cross_attention_weights = net.predict(text)
print("output shapes:")
print("mel_output: ", decoder_output.numpy().shape)
print("encoder_attentions:", [item.shape for item in encoder_attention_weights])
print("decoder_attentions:", [item.shape for item in cross_attention_weights])

View File

@ -1,130 +0,0 @@
import numpy as np
import unittest
import paddle
paddle.set_default_dtype("float64")
paddle.disable_static(paddle.CPUPlace())
from parakeet.models import waveflow
class TestFold(unittest.TestCase):
def test_audio(self):
x = paddle.randn([4, 32 * 8])
y = waveflow.fold(x, 8)
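# fold splits the trailing time axis into frames of size 8: 256 -> (32, 8)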
self.assertTupleEqual(y.numpy().shape, (4, 32, 8))
def test_spec(self):
x = paddle.randn([4, 80, 32 * 8])
y = waveflow.fold(x, 8)
self.assertTupleEqual(y.numpy().shape, (4, 80, 32, 8))
class TestUpsampleNet(unittest.TestCase):
def test_io(self):
net = waveflow.UpsampleNet([2, 2])
x = paddle.randn([4, 8, 6])
y = net(x)
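# each scale factor doubles the time axis, so [2, 2] upsamples 6 -> 2 * 2 * 6 = 24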
self.assertTupleEqual(y.numpy().shape, (4, 8, 2 * 2 * 6))
class TestResidualBlock(unittest.TestCase):
def test_io(self):
net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
x = paddle.randn([4, 4, 16, 32])
condition = paddle.randn([4, 6, 16, 32])
res, skip = net(x, condition)
self.assertTupleEqual(res.numpy().shape, (4, 4, 16, 32))
self.assertTupleEqual(skip.numpy().shape, (4, 4, 16, 32))
def test_add_input(self):
net = waveflow.ResidualBlock(4, 6, (3, 3), (2, 2))
net.eval()
net.start_sequence()
x_row = paddle.randn([4, 4, 1, 32])
condition_row = paddle.randn([4, 6, 1, 32])
res, skip = net.add_input(x_row, condition_row)
self.assertTupleEqual(res.numpy().shape, (4, 4, 1, 32))
self.assertTupleEqual(skip.numpy().shape, (4, 4, 1, 32))
class TestResidualNet(unittest.TestCase):
def test_io(self):
net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
x = paddle.randn([4, 6, 8, 32])
condition = paddle.randn([4, 8, 8, 32])
y = net(x, condition)
self.assertTupleEqual(y.numpy().shape, (4, 6, 8, 32))
def test_add_input(self):
net = waveflow.ResidualNet(8, 6, 8, (3, 3), [1, 1, 1, 1, 1, 1, 1, 1])
net.eval()
net.start_sequence()
x_row = paddle.randn([4, 6, 1, 32])
condition_row = paddle.randn([4, 8, 1, 32])
y_row = net.add_input(x_row, condition_row)
self.assertTupleEqual(y_row.numpy().shape, (4, 6, 1, 32))
class TestFlow(unittest.TestCase):
def test_io(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
x = paddle.randn([4, 1, 8, 32])
condition = paddle.randn([4, 7, 8, 32])
z, (logs, b) = net(x, condition)
self.assertTupleEqual(z.numpy().shape, (4, 1, 8, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
def test_inverse_row(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
net.eval()
net._start_sequence()
x_row = paddle.randn([4, 1, 1, 32]) # last row
condition_row = paddle.randn([4, 7, 1, 32])
z_row = paddle.randn([4, 1, 1, 32])
x_next_row, (logs, b) = net._inverse_row(z_row, x_row, condition_row)
self.assertTupleEqual(x_next_row.numpy().shape, (4, 1, 1, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 1, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 1, 32))
def test_inverse(self):
net = waveflow.Flow(8, 16, 7, (3, 3), 8)
net.eval()
z = paddle.randn([4, 1, 8, 32])
condition = paddle.randn([4, 7, 8, 32])
with paddle.no_grad():
x, (logs, b) = net.inverse(z, condition)
self.assertTupleEqual(x.numpy().shape, (4, 1, 8, 32))
self.assertTupleEqual(logs.numpy().shape, (4, 1, 7, 32))
self.assertTupleEqual(b.numpy().shape, (4, 1, 7, 32))
class TestWaveFlow(unittest.TestCase):
def test_io(self):
x = paddle.randn([4, 32 * 8])
condition = paddle.randn([4, 7, 32 * 8])
net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
z, logs_det_jacobian = net(x, condition)
self.assertTupleEqual(z.numpy().shape, (4, 32 * 8))
self.assertTupleEqual(logs_det_jacobian.numpy().shape, (1,))
def test_inverse(self):
z = paddle.randn([4, 32 * 8])
condition = paddle.randn([4, 7, 32 * 8])
net = waveflow.WaveFlow(2, 8, 8, 16, 7, (3, 3))
net.eval()
with paddle.no_grad():
x = net.inverse(z, condition)
self.assertTupleEqual(x.numpy().shape, (4, 32 * 8))
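# WaveFlow is invertible: the forward pass maps audio x to a latent z (plus a
# log-determinant term for likelihood training), and inverse recovers a
# waveform of the same shape from z and the condition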