format all the code with yapf
This commit is contained in:
parent
c866bb0b57
commit
e03e96d9e4
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# This file only contains a selection of the most common options. For a full
|
||||
|
@ -14,7 +28,6 @@
|
|||
# import sys
|
||||
# sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'parakeet'
|
||||
|
@ -24,7 +37,6 @@ author = 'parakeet-developers'
|
|||
# The full version, including alpha/beta/rc tags
|
||||
release = '0.2'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
|
@ -46,7 +58,6 @@ templates_path = ['_templates']
|
|||
# This pattern also affects html_static_path and html_extra_path.
|
||||
exclude_patterns = []
|
||||
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
|
|
|
@ -102,11 +102,3 @@ optional arguments:
|
|||
--opts ... options to overwrite --config file and the default
|
||||
config, passing in KEY VALUE pairs
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -72,5 +72,3 @@ Dataset --(transform)--> Dataset --+
|
|||
```
|
||||
|
||||
在这个软件源中包含了几个例子,可以在 [Parakeet/examples](../examples) 中查看。这些实验被作为样例提供给用户,可以直接运行。同时也欢迎用户添加新的模型和实验并为 `Parakeet` 贡献代码。
|
||||
|
||||
|
||||
|
|
|
@ -9,10 +9,3 @@ Parakeet 为用户和开发者提供了
|
|||
1. 可复用的模型以及常用的模块;
|
||||
2. 从数据处理,模型训练到预测等一系列过程的完整实验;
|
||||
3. 高质量的开箱即用模型。
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
|
@ -14,8 +28,7 @@ _C.data = CN(
|
|||
padding_idx=0, # text embedding's padding index
|
||||
mel_start_value=0.5, # value for starting frame
|
||||
mel_end_value=-0.5, # value for ending frame
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
|
@ -33,8 +46,7 @@ _C.model = CN(
|
|||
dropout=0.1, # global dropout probability
|
||||
stop_loss_scale=8.0, # scale factor for the stop loss
|
||||
decoder_prenet_dropout=0.5, # decoder prenet dropout probability
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
|
@ -45,8 +57,8 @@ _C.training = CN(
|
|||
valid_interval=1000, # validation
|
||||
save_interval=10000, # checkpoint
|
||||
max_iteration=900000, # max iteration to train
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
@ -7,8 +21,10 @@ from paddle.io import Dataset, DataLoader
|
|||
from parakeet.data.batch import batch_spec, batch_text_id
|
||||
from parakeet.data import dataset
|
||||
|
||||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
records = []
|
||||
|
@ -38,8 +54,8 @@ class Transform(object):
|
|||
ids, mel = example # ids already have <s> and </s>
|
||||
ids = np.array(ids, dtype=np.int64)
|
||||
# add start and end frame
|
||||
mel = np.pad(mel,
|
||||
[(0, 0), (1, 1)],
|
||||
mel = np.pad(
|
||||
mel, [(0, 0), (1, 1)],
|
||||
mode='constant',
|
||||
constant_values=[(0, 0), (self.start_value, self.end_value)])
|
||||
stop_labels = np.ones([mel.shape[1]], dtype=np.int64)
|
||||
|
@ -50,6 +66,7 @@ class Transform(object):
|
|||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
|
||||
def __init__(self, padding_idx=0, padding_value=0.):
|
||||
self.padding_idx = padding_idx
|
||||
self.padding_value = padding_value
|
||||
|
@ -67,7 +84,8 @@ class LJSpeechCollector(object):
|
|||
|
||||
def create_dataloader(config, source_path):
|
||||
lj = LJSpeech(source_path)
|
||||
transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
|
||||
transform = Transform(config.data.mel_start_value,
|
||||
config.data.mel_end_value)
|
||||
lj = dataset.TransformDataset(lj, transform)
|
||||
|
||||
valid_set, train_set = dataset.split(lj, config.data.valid_size)
|
||||
|
@ -85,4 +103,3 @@ def create_dataloader(config, source_path):
|
|||
drop_last=False,
|
||||
collate_fn=data_collator)
|
||||
return train_loader, valid_loader
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tqdm
|
||||
import pickle
|
||||
|
@ -11,6 +25,7 @@ from parakeet.frontend import English
|
|||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def create_dataset(config, source_path, target_path, verbose=False):
|
||||
# create output dir
|
||||
target_path = Path(target_path).expanduser()
|
||||
|
@ -47,7 +62,8 @@ def create_dataset(config, source_path, target_path, verbose=False):
|
|||
with open(target_path / "metadata.pkl", 'wb') as f:
|
||||
pickle.dump(records, f)
|
||||
if verbose:
|
||||
print("saved metadata into {}".format(target_path / "metadata.pkl"))
|
||||
print("saved metadata into {}".format(target_path /
|
||||
"metadata.pkl"))
|
||||
|
||||
# also save meta data into text format for inspection
|
||||
with open(target_path / "metadata.txt", 'wt') as f:
|
||||
|
@ -55,20 +71,30 @@ def create_dataset(config, source_path, target_path, verbose=False):
|
|||
phoneme_str = "|".join(phonemes)
|
||||
f.write("{}\t{}\t{}\n".format(mel_name, text, phoneme_str))
|
||||
if verbose:
|
||||
print("saved metadata into {}".format(target_path / "metadata.txt"))
|
||||
print("saved metadata into {}".format(target_path /
|
||||
"metadata.txt"))
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
|
||||
parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument("--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument("--opts", nargs=argparse.REMAINDER,
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
@ -13,14 +27,15 @@ from parakeet.utils.display import add_attention_plots
|
|||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
@paddle.fluid.dygraph.no_grad
|
||||
def main(config, args):
|
||||
paddle.set_device(args.device)
|
||||
|
||||
# model
|
||||
frontend = English()
|
||||
model = TransformerTTS.from_pretrained(
|
||||
frontend, config, args.checkpoint_path)
|
||||
model = TransformerTTS.from_pretrained(frontend, config,
|
||||
args.checkpoint_path)
|
||||
model.eval()
|
||||
|
||||
# inputs
|
||||
|
@ -38,19 +53,33 @@ def main(config, args):
|
|||
mel_output = mel_output.T #(C, T)
|
||||
np.save(str(output_dir / f"sentence_{i}"), mel_output)
|
||||
if args.verbose:
|
||||
print("spectrogram saved at {}".format(output_dir / f"sentence_{i}.npy"))
|
||||
print("spectrogram saved at {}".format(output_dir /
|
||||
f"sentence_{i}.npy"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
|
||||
parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
|
||||
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument("--input", type=str, help="path of the text sentences")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
@ -19,6 +33,7 @@ from parakeet.training.experiment import ExperimentBase
|
|||
from config import get_cfg_defaults
|
||||
from ljspeech import LJSpeech, LJSpeechCollector, Transform
|
||||
|
||||
|
||||
class Experiment(ExperimentBase):
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
|
@ -46,8 +61,7 @@ class Experiment(ExperimentBase):
|
|||
beta1=0.9,
|
||||
beta2=0.98,
|
||||
epsilon=1e-9,
|
||||
parameters=model.parameters()
|
||||
)
|
||||
parameters=model.parameters())
|
||||
criterion = TransformerTTSLoss(config.model.stop_loss_scale)
|
||||
drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
|
||||
reduction_factor = scheduler.StepWise(config.training.reduction_factor)
|
||||
|
@ -63,9 +77,12 @@ class Experiment(ExperimentBase):
|
|||
config = self.config
|
||||
|
||||
ljspeech_dataset = LJSpeech(args.data)
|
||||
transform = Transform(config.data.mel_start_value, config.data.mel_end_value)
|
||||
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset, transform)
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
|
||||
transform = Transform(config.data.mel_start_value,
|
||||
config.data.mel_end_value)
|
||||
ljspeech_dataset = dataset.TransformDataset(ljspeech_dataset,
|
||||
transform)
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset,
|
||||
config.data.valid_size)
|
||||
batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
|
||||
|
||||
if not self.parallel:
|
||||
|
@ -115,11 +132,8 @@ class Experiment(ExperimentBase):
|
|||
|
||||
time_steps = mel_target.shape[1]
|
||||
losses = self.criterion(
|
||||
mel_output[:,:time_steps, :],
|
||||
mel_intermediate[:,:time_steps, :],
|
||||
mel_target,
|
||||
stop_logits[:,:time_steps, :],
|
||||
stop_label_target)
|
||||
mel_output[:, :time_steps, :], mel_intermediate[:, :time_steps, :],
|
||||
mel_target, stop_logits[:, :time_steps, :], stop_label_target)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
|
@ -141,13 +155,16 @@ class Experiment(ExperimentBase):
|
|||
# logging
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items())
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
for k, v in losses_np.items():
|
||||
self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
|
||||
self.visualizer.add_scalar(f"train_loss/{k}", v,
|
||||
self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
|
@ -165,8 +182,7 @@ class Experiment(ExperimentBase):
|
|||
display.add_multi_attention_plots(
|
||||
self.visualizer,
|
||||
f"valid_sentence_{i}_cross_attention_weights",
|
||||
attention_weights,
|
||||
self.iteration)
|
||||
attention_weights, self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
|
@ -12,8 +26,7 @@ _C.data = CN(
|
|||
f_max=8000, # Hz, max frequency when converting to mel
|
||||
n_mels=80, # mel bands
|
||||
clip_frames=65, # mel clip frames
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
|
@ -24,8 +37,7 @@ _C.model = CN(
|
|||
channels=128, # residual channels in each flow
|
||||
kernel_size=[3, 3], # kernel size in each conv block
|
||||
sigma=1.0, # stddev of the random noise
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
|
@ -33,8 +45,8 @@ _C.training = CN(
|
|||
valid_interval=1000, # validation
|
||||
save_interval=10000, # checkpoint
|
||||
max_iteration=3000000, # max iteration to train
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
@ -9,16 +23,17 @@ from parakeet.data.batch import batch_spec, batch_wav
|
|||
from parakeet.data import dataset
|
||||
from parakeet.audio import AudioProcessor
|
||||
|
||||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
meta_data = pandas.read_csv(
|
||||
str(self.root / "metadata.csv"),
|
||||
sep="\t",
|
||||
header=None,
|
||||
names=["fname", "frames", "samples"]
|
||||
)
|
||||
names=["fname", "frames", "samples"])
|
||||
|
||||
records = []
|
||||
for row in meta_data.itertuples():
|
||||
|
@ -39,6 +54,7 @@ class LJSpeech(Dataset):
|
|||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
|
||||
def __init__(self, padding_value=0.):
|
||||
self.padding_value = padding_value
|
||||
|
||||
|
@ -71,8 +87,6 @@ class LJSpeechClipCollector(object):
|
|||
frames = mel.shape[-1]
|
||||
start = np.random.randint(0, frames - self.clip_frames)
|
||||
mel_clip = mel[:, start:start + self.clip_frames]
|
||||
wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
|
||||
wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
|
||||
self.hop_length]
|
||||
return mel_clip, wav_clip
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tqdm
|
||||
import csv
|
||||
|
@ -86,12 +100,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
|
|||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
transform = Transform(
|
||||
config.sample_rate,
|
||||
config.n_fft,
|
||||
config.win_length,
|
||||
config.hop_length,
|
||||
config.n_mels)
|
||||
transform = Transform(config.sample_rate, config.n_fft, config.win_length,
|
||||
config.hop_length, config.n_mels)
|
||||
file_names = []
|
||||
|
||||
for example in tqdm.tqdm(dataset):
|
||||
|
@ -109,20 +119,32 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
|
|||
file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
|
||||
|
||||
meta_data = pd.DataFrame.from_records(file_names)
|
||||
meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
|
||||
print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
|
||||
meta_data.to_csv(
|
||||
str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
|
||||
print("saved meta data in to {}".format(
|
||||
os.path.join(output_dir, "metadata.csv")))
|
||||
|
||||
print("Done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
|
||||
parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument("--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument("--opts", nargs=argparse.REMAINDER,
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
@ -8,9 +22,9 @@ import parakeet
|
|||
from parakeet.models.waveflow import UpsampleNet, WaveFlow, ConditionalWaveFlow
|
||||
from parakeet.utils import layer_tools, checkpoint
|
||||
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def main(config, args):
|
||||
paddle.set_device(args.device)
|
||||
model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path)
|
||||
|
@ -23,7 +37,8 @@ def main(config, args):
|
|||
for file_path in mel_dir.iterdir():
|
||||
mel = np.load(str(file_path))
|
||||
audio = model.predict(mel)
|
||||
audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
|
||||
audio_path = output_dir / (
|
||||
os.path.splitext(file_path.name)[0] + ".wav")
|
||||
sf.write(audio_path, audio, config.data.sample_rate)
|
||||
print("[synthesize] {} -> {}".format(file_path, audio_path))
|
||||
|
||||
|
@ -31,14 +46,29 @@ def main(config, args):
|
|||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
|
||||
parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
|
||||
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
help="path of directory containing mel spectrogram (in .npy format)")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
|
@ -34,7 +48,8 @@ class Experiment(ExperimentBase):
|
|||
|
||||
if self.parallel > 1:
|
||||
model = paddle.DataParallel(model)
|
||||
optimizer = paddle.optimizer.Adam(config.training.lr, parameters=model.parameters())
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
config.training.lr, parameters=model.parameters())
|
||||
criterion = WaveFlowLoss(sigma=config.model.sigma)
|
||||
|
||||
self.model = model
|
||||
|
@ -46,9 +61,11 @@ class Experiment(ExperimentBase):
|
|||
args = self.args
|
||||
|
||||
ljspeech_dataset = LJSpeech(args.data)
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset,
|
||||
config.data.valid_size)
|
||||
|
||||
batch_fn = LJSpeechClipCollector(config.data.clip_frames, config.data.hop_length)
|
||||
batch_fn = LJSpeechClipCollector(config.data.clip_frames,
|
||||
config.data.hop_length)
|
||||
|
||||
if not self.parallel:
|
||||
train_loader = DataLoader(
|
||||
|
@ -97,10 +114,12 @@ class Experiment(ExperimentBase):
|
|||
loss_value = float(loss)
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += "loss: {:>.6f}".format(loss_value)
|
||||
self.logger.info(msg)
|
||||
self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
|
||||
self.visualizer.add_scalar(
|
||||
"train/loss", loss_value, global_step=self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
|
@ -112,7 +131,8 @@ class Experiment(ExperimentBase):
|
|||
loss = self.criterion(z, log_det_jocobian)
|
||||
valid_losses.append(float(loss))
|
||||
valid_loss = np.mean(valid_losses)
|
||||
self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
|
||||
self.visualizer.add_scalar(
|
||||
"valid/loss", valid_loss, global_step=self.iteration)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
|
@ -12,8 +26,7 @@ _C.data = CN(
|
|||
# f_max=8000, # Hz, max frequency when converting to mel
|
||||
n_mels=80, # mel bands
|
||||
train_clip_seconds=0.5, # audio clip length(in seconds)
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
|
@ -24,9 +37,7 @@ _C.model = CN(
|
|||
residual_channels=128, # residual channel in each flow
|
||||
loss_type="mog",
|
||||
output_dim=3, # single gaussian
|
||||
log_scale_min=-9.0,
|
||||
)
|
||||
)
|
||||
log_scale_min=-9.0, ))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
|
@ -37,8 +48,8 @@ _C.training = CN(
|
|||
save_interval=10000, # checkpoint
|
||||
max_iteration=3000000, # max iteration to train
|
||||
gradient_max_norm=100.0 # global norm of gradients
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
@ -9,16 +23,17 @@ from parakeet.data.batch import batch_spec, batch_wav
|
|||
from parakeet.data import dataset
|
||||
from parakeet.audio import AudioProcessor
|
||||
|
||||
|
||||
class LJSpeech(Dataset):
|
||||
"""A simple dataset adaptor for the processed ljspeech dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
meta_data = pandas.read_csv(
|
||||
str(self.root / "metadata.csv"),
|
||||
sep="\t",
|
||||
header=None,
|
||||
names=["fname", "frames", "samples"]
|
||||
)
|
||||
names=["fname", "frames", "samples"])
|
||||
|
||||
records = []
|
||||
for row in meta_data.itertuples():
|
||||
|
@ -39,6 +54,7 @@ class LJSpeech(Dataset):
|
|||
|
||||
class LJSpeechCollector(object):
|
||||
"""A simple callable to batch LJSpeech examples."""
|
||||
|
||||
def __init__(self, padding_value=0.):
|
||||
self.padding_value = padding_value
|
||||
|
||||
|
@ -75,7 +91,8 @@ class LJSpeechClipCollector(object):
|
|||
mel, wav = example
|
||||
frames = mel.shape[-1]
|
||||
start = np.random.randint(0, frames - self.clip_frames)
|
||||
wav_clip = wav[start * self.hop_length: (start + self.clip_frames) * self.hop_length]
|
||||
wav_clip = wav[start * self.hop_length:(start + self.clip_frames) *
|
||||
self.hop_length]
|
||||
return mel, wav_clip, start
|
||||
|
||||
|
||||
|
@ -132,7 +149,3 @@ class DataCollector(object):
|
|||
audios = np.array(audios, dtype=np.float32)
|
||||
audio_starts = np.array(audio_starts, dtype=np.int64)
|
||||
return audios, mels, audio_starts
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tqdm
|
||||
import csv
|
||||
|
@ -87,12 +101,8 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
|
|||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
transform = Transform(
|
||||
config.sample_rate,
|
||||
config.n_fft,
|
||||
config.win_length,
|
||||
config.hop_length,
|
||||
config.n_mels)
|
||||
transform = Transform(config.sample_rate, config.n_fft, config.win_length,
|
||||
config.hop_length, config.n_mels)
|
||||
file_names = []
|
||||
|
||||
for example in tqdm.tqdm(dataset):
|
||||
|
@ -110,20 +120,32 @@ def create_dataset(config, input_dir, output_dir, verbose=True):
|
|||
file_names.append((base_name, mel.shape[-1], audio.shape[-1]))
|
||||
|
||||
meta_data = pd.DataFrame.from_records(file_names)
|
||||
meta_data.to_csv(str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
|
||||
print("saved meta data in to {}".format(os.path.join(output_dir, "metadata.csv")))
|
||||
meta_data.to_csv(
|
||||
str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
|
||||
print("saved meta data in to {}".format(
|
||||
os.path.join(output_dir, "metadata.csv")))
|
||||
|
||||
print("Done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="create dataset")
|
||||
parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
|
||||
parser.add_argument("--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument("--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument("--opts", nargs=argparse.REMAINDER,
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input", type=str, help="path of the ljspeech dataset")
|
||||
parser.add_argument(
|
||||
"--output", type=str, help="path to save output dataset")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
config = get_cfg_defaults()
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
@ -10,6 +24,7 @@ from parakeet.utils import layer_tools, checkpoint
|
|||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def main(config, args):
|
||||
paddle.set_device(args.device)
|
||||
model = ConditionalWaveNet.from_pretrained(config, args.checkpoint_path)
|
||||
|
@ -22,7 +37,8 @@ def main(config, args):
|
|||
for file_path in mel_dir.iterdir():
|
||||
mel = np.load(str(file_path))
|
||||
audio = model.predict(mel)
|
||||
audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
|
||||
audio_path = output_dir / (
|
||||
os.path.splitext(file_path.name)[0] + ".wav")
|
||||
sf.write(audio_path, audio, config.data.sample_rate)
|
||||
print("[synthesize] {} -> {}".format(file_path, audio_path))
|
||||
|
||||
|
@ -30,14 +46,29 @@ def main(config, args):
|
|||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
|
||||
parser = argparse.ArgumentParser(description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument("--config", type=str, metavar="FILE", help="extra config to overwrite the default config")
|
||||
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument("--input", type=str, help="path of directory containing mel spectrogram (in .npy format)")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="generate mel spectrogram with TransformerTTS.")
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="extra config to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--checkpoint_path", type=str, help="path of the checkpoint to load.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
help="path of directory containing mel spectrogram (in .npy format)")
|
||||
parser.add_argument("--output", type=str, help="path to save outputs")
|
||||
parser.add_argument("--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="print msg")
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cpu", help="device type to use.")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose", action="store_true", help="print msg")
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
import math
|
||||
|
@ -39,13 +53,13 @@ class Experiment(ExperimentBase):
|
|||
model = paddle.DataParallel(model)
|
||||
|
||||
lr_scheduler = paddle.optimizer.lr.StepDecay(
|
||||
config.training.lr,
|
||||
config.training.anneal_interval,
|
||||
config.training.lr, config.training.anneal_interval,
|
||||
config.training.anneal_rate)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
lr_scheduler,
|
||||
parameters=model.parameters(),
|
||||
grad_clip=paddle.nn.ClipGradByGlobalNorm(config.training.gradient_max_norm))
|
||||
grad_clip=paddle.nn.ClipGradByGlobalNorm(
|
||||
config.training.gradient_max_norm))
|
||||
|
||||
self.model = model
|
||||
self.model_core = model._layer if self.parallel else model
|
||||
|
@ -56,7 +70,8 @@ class Experiment(ExperimentBase):
|
|||
args = self.args
|
||||
|
||||
ljspeech_dataset = LJSpeech(args.data)
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset, config.data.valid_size)
|
||||
valid_set, train_set = dataset.split(ljspeech_dataset,
|
||||
config.data.valid_size)
|
||||
|
||||
# convolutional net's causal padding size
|
||||
context_size = config.model.n_stack \
|
||||
|
@ -66,7 +81,8 @@ class Experiment(ExperimentBase):
|
|||
|
||||
# frames used to compute loss
|
||||
frames_per_second = config.data.sample_rate // config.data.hop_length
|
||||
train_clip_frames = math.ceil(config.data.train_clip_seconds * frames_per_second)
|
||||
train_clip_frames = math.ceil(config.data.train_clip_seconds *
|
||||
frames_per_second)
|
||||
|
||||
num_frames = train_clip_frames + context_frames
|
||||
batch_fn = LJSpeechClipCollector(num_frames, config.data.hop_length)
|
||||
|
@ -111,10 +127,12 @@ class Experiment(ExperimentBase):
|
|||
loss_value = float(loss)
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, iteration_time)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += "loss: {:>.6f}".format(loss_value)
|
||||
self.logger.info(msg)
|
||||
self.visualizer.add_scalar("train/loss", loss_value, global_step=self.iteration)
|
||||
self.visualizer.add_scalar(
|
||||
"train/loss", loss_value, global_step=self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
|
@ -126,7 +144,8 @@ class Experiment(ExperimentBase):
|
|||
loss = self.model.loss(y, wav)
|
||||
valid_losses.append(float(loss))
|
||||
valid_loss = np.mean(valid_losses)
|
||||
self.visualizer.add_scalar("valid/loss", valid_loss, global_step=self.iteration)
|
||||
self.visualizer.add_scalar(
|
||||
"valid/loss", valid_loss, global_step=self.iteration)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
|
|
|
@ -18,6 +18,7 @@ import numpy as np
|
|||
|
||||
__all__ = ["AudioProcessor"]
|
||||
|
||||
|
||||
class AudioProcessor(object):
|
||||
def __init__(self,
|
||||
sample_rate: int,
|
||||
|
@ -50,8 +51,7 @@ class AudioProcessor(object):
|
|||
self.inv_mel_filter = np.linalg.pinv(self.mel_filter)
|
||||
|
||||
def _create_mel_filter(self):
|
||||
mel_filter = librosa.filters.mel(
|
||||
self.sample_rate,
|
||||
mel_filter = librosa.filters.mel(self.sample_rate,
|
||||
self.n_fft,
|
||||
n_mels=self.n_mels,
|
||||
fmin=self.f_min,
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
This module contains normalizers for spectrogram magnitude.
|
||||
|
@ -23,10 +36,12 @@ class NormalizerBase(object):
|
|||
def inverse(self, normalized):
    """Placeholder for the inverse transform of ``normalized``.

    This base implementation always raises; per the error message,
    concrete normalizers are expected to provide their own ``inverse``.

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError("inverse must be implemented")
|
||||
|
||||
|
||||
class LogMagnitude(NormalizerBase):
|
||||
"""
|
||||
This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
|
||||
"""
|
||||
|
||||
def __init__(self, min=1e-7):
|
||||
self.min = min
|
||||
|
||||
|
@ -44,6 +59,7 @@ class UnitMagnitude(NormalizerBase):
|
|||
"""
|
||||
This is the normalizer used in the
|
||||
"""
|
||||
|
||||
def __init__(self, min=1e-5):
|
||||
self.min = min
|
||||
|
||||
|
|
|
@ -18,10 +18,15 @@ Batch functions for text sequences, audio and spectrograms are provided.
|
|||
import numpy as np
|
||||
|
||||
__all__ = [
|
||||
"batch_text_id", "batch_wav", "batch_spec",
|
||||
"TextIDBatcher", "WavBatcher", "SpecBatcher",
|
||||
"batch_text_id",
|
||||
"batch_wav",
|
||||
"batch_spec",
|
||||
"TextIDBatcher",
|
||||
"WavBatcher",
|
||||
"SpecBatcher",
|
||||
]
|
||||
|
||||
|
||||
class TextIDBatcher(object):
|
||||
"""A wrapper class for `batch_text_id`."""
|
||||
|
||||
|
@ -113,7 +118,11 @@ class SpecBatcher(object):
|
|||
self.time_major = time_major
|
||||
|
||||
def __call__(self, minibatch):
|
||||
out = batch_spec(minibatch, pad_value=self.pad_value, time_major=self.time_major, dtype=self.dtype)
|
||||
out = batch_spec(
|
||||
minibatch,
|
||||
pad_value=self.pad_value,
|
||||
time_major=self.time_major,
|
||||
dtype=self.dtype)
|
||||
return out
|
||||
|
||||
|
||||
|
@ -130,7 +139,8 @@ def batch_spec(minibatch, pad_value=0., time_major=False, dtype=np.float32):
|
|||
"""
|
||||
# assume (F, T) or (T, F)
|
||||
peek_example = minibatch[0]
|
||||
assert len(peek_example.shape) == 2, "we only handles mono channel spectrogram"
|
||||
assert len(
|
||||
peek_example.shape) == 2, "we only handles mono channel spectrogram"
|
||||
|
||||
# assume (F, n_frame) or (n_frame, F)
|
||||
time_idx = 0 if time_major else -1
|
||||
|
|
|
@ -17,17 +17,25 @@ import paddle
|
|||
from paddle.io import Dataset
|
||||
|
||||
__all__ = [
|
||||
"split", "TransformDataset", "CacheDataset", "TupleDataset",
|
||||
"DictDataset", "SliceDataset", "SubsetDataset", "FilterDataset",
|
||||
"split",
|
||||
"TransformDataset",
|
||||
"CacheDataset",
|
||||
"TupleDataset",
|
||||
"DictDataset",
|
||||
"SliceDataset",
|
||||
"SubsetDataset",
|
||||
"FilterDataset",
|
||||
"ChainDataset",
|
||||
]
|
||||
|
||||
|
||||
def split(dataset, first_size):
|
||||
"""A utility function to split a dataset into two datasets."""
|
||||
first = SliceDataset(dataset, 0, first_size)
|
||||
second = SliceDataset(dataset, first_size, len(dataset))
|
||||
return first, second
|
||||
|
||||
|
||||
class TransformDataset(Dataset):
|
||||
def __init__(self, dataset, transform):
|
||||
"""Dataset which is transformed from another with a transform.
|
||||
|
|
|
@ -1,2 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.datasets.common import *
|
||||
from parakeet.datasets.ljspeech import *
|
|
@ -1,9 +1,24 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from paddle.io import Dataset
|
||||
import os
|
||||
import librosa
|
||||
|
||||
__all__ = ["AudioFolderDataset"]
|
||||
|
||||
|
||||
class AudioFolderDataset(Dataset):
|
||||
def __init__(self, path, sample_rate, extension="wav"):
|
||||
self.root = os.path.expanduser(path)
|
||||
|
|
|
@ -1,8 +1,23 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from paddle.io import Dataset
|
||||
from pathlib import Path
|
||||
|
||||
__all__ = ["LJSpeechMetaData"]
|
||||
|
||||
|
||||
class LJSpeechMetaData(Dataset):
|
||||
def __init__(self, root):
|
||||
self.root = Path(root).expanduser()
|
||||
|
@ -22,4 +37,3 @@ class LJSpeechMetaData(Dataset):
|
|||
|
||||
def __len__(self):
    """Return the number of entries in ``self.records``."""
    return len(self.records)
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.frontend.vocab import *
|
||||
from parakeet.frontend.phonectic import *
|
||||
from parakeet.frontend.punctuation import *
|
||||
|
|
|
@ -1,2 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.frontend.normalizer.normalizer import *
|
||||
from parakeet.frontend.normalizer.numbers import *
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
def full2half_width(ustr):
|
||||
half = []
|
||||
for u in ustr:
|
||||
|
@ -10,6 +24,7 @@ def full2half_width(ustr):
|
|||
half.append(u)
|
||||
return ''.join(half)
|
||||
|
||||
|
||||
def half2full_width(ustr):
|
||||
full = []
|
||||
for u in ustr:
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import abc
|
||||
import string
|
||||
|
||||
|
@ -13,15 +27,8 @@ EN_PUNCT = [
|
|||
"!",
|
||||
]
|
||||
|
||||
CN_PUNCT = [
|
||||
"、",
|
||||
",",
|
||||
";",
|
||||
":",
|
||||
"。",
|
||||
"?",
|
||||
"!"
|
||||
]
|
||||
CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"]
|
||||
|
||||
|
||||
def get_punctuations(lang):
|
||||
if lang == "en":
|
||||
|
@ -30,4 +37,3 @@ def get_punctuations(lang):
|
|||
return CN_PUNCT
|
||||
else:
|
||||
raise ValueError(f"language {lang} Not supported")
|
||||
|
||||
|
|
|
@ -575,7 +575,8 @@ class TransformerTTS(nn.Layer):
|
|||
decoder_prenet_dropout=config.model.decoder_prenet_dropout,
|
||||
dropout=config.model.dropout)
|
||||
|
||||
iteration = checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
|
||||
iteration = checkpoint.load_parameters(
|
||||
model, checkpoint_path=checkpoint_path)
|
||||
drop_n_heads = scheduler.StepWise(config.training.drop_n_heads)
|
||||
reduction_factor = scheduler.StepWise(config.training.reduction_factor)
|
||||
model.set_constants(
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
from typing import List, Union, Tuple
|
||||
|
@ -11,6 +25,7 @@ from parakeet.modules import geometry as geo
|
|||
|
||||
__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
|
||||
|
||||
|
||||
def fold(x, n_group):
|
||||
r"""Fold audio or spectrogram's temporal dimension into groups.
|
||||
|
||||
|
@ -31,6 +46,7 @@ def fold(x, n_group):
|
|||
new_shape = spatial_shape + [time_steps // n_group, n_group]
|
||||
return paddle.reshape(x, new_shape)
|
||||
|
||||
|
||||
class UpsampleNet(nn.LayerList):
|
||||
"""Layer to upsample mel spectrogram to the same temporal resolution as
|
||||
the corresponding waveform.
|
||||
|
@ -60,6 +76,7 @@ class UpsampleNet(nn.LayerList):
|
|||
---------
|
||||
``librosa.core.stft``
|
||||
"""
|
||||
|
||||
def __init__(self, upsample_factors):
|
||||
super(UpsampleNet, self).__init__()
|
||||
for factor in upsample_factors:
|
||||
|
@ -67,7 +84,9 @@ class UpsampleNet(nn.LayerList):
|
|||
init = I.Uniform(-std, std)
|
||||
self.append(
|
||||
nn.utils.weight_norm(
|
||||
nn.Conv2DTranspose(1, 1, (3, 2 * factor),
|
||||
nn.Conv2DTranspose(
|
||||
1,
|
||||
1, (3, 2 * factor),
|
||||
padding=(1, factor // 2),
|
||||
stride=(1, factor),
|
||||
weight_attr=init,
|
||||
|
@ -131,15 +150,21 @@ class ResidualBlock(nn.Layer):
|
|||
dilations : int
|
||||
Dilations of the Convolution2d applied to the input.
|
||||
"""
|
||||
|
||||
def __init__(self, channels, cond_channels, kernel_size, dilations):
|
||||
super(ResidualBlock, self).__init__()
|
||||
# input conv
|
||||
std = math.sqrt(1 / channels * np.prod(kernel_size))
|
||||
init = I.Uniform(-std, std)
|
||||
receptive_field = [1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)]
|
||||
receptive_field = [
|
||||
1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
|
||||
]
|
||||
rh, rw = receptive_field
|
||||
paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2] # causal & same
|
||||
conv = nn.Conv2D(channels, 2 * channels, kernel_size,
|
||||
conv = nn.Conv2D(
|
||||
channels,
|
||||
2 * channels,
|
||||
kernel_size,
|
||||
padding=paddings,
|
||||
dilation=dilations,
|
||||
weight_attr=init,
|
||||
|
@ -152,15 +177,18 @@ class ResidualBlock(nn.Layer):
|
|||
# condition projection
|
||||
std = math.sqrt(1 / cond_channels)
|
||||
init = I.Uniform(-std, std)
|
||||
condition_proj = nn.Conv2D(cond_channels, 2 * channels, (1, 1),
|
||||
weight_attr=init, bias_attr=init)
|
||||
condition_proj = nn.Conv2D(
|
||||
cond_channels,
|
||||
2 * channels, (1, 1),
|
||||
weight_attr=init,
|
||||
bias_attr=init)
|
||||
self.condition_proj = nn.utils.weight_norm(condition_proj)
|
||||
|
||||
# parametric residual & skip connection
|
||||
std = math.sqrt(1 / channels)
|
||||
init = I.Uniform(-std, std)
|
||||
out_proj = nn.Conv2D(channels, 2 * channels, (1, 1),
|
||||
weight_attr=init, bias_attr=init)
|
||||
out_proj = nn.Conv2D(
|
||||
channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
|
||||
self.out_proj = nn.utils.weight_norm(out_proj)
|
||||
|
||||
def forward(self, x, condition):
|
||||
|
@ -290,6 +318,7 @@ class ResidualNet(nn.LayerList):
|
|||
ValueError
|
||||
If the length of dilations_h does not equal n_layers.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_layer: int,
|
||||
residual_channels: int,
|
||||
|
@ -297,11 +326,13 @@ class ResidualNet(nn.LayerList):
|
|||
kernel_size: Tuple[int],
|
||||
dilations_h: List[int]):
|
||||
if len(dilations_h) != n_layer:
|
||||
raise ValueError("number of dilations_h should equals num of layers")
|
||||
raise ValueError(
|
||||
"number of dilations_h should equals num of layers")
|
||||
super(ResidualNet, self).__init__()
|
||||
for i in range(n_layer):
|
||||
dilation = (dilations_h[i], 2**i)
|
||||
layer = ResidualBlock(residual_channels, condition_channels, kernel_size, dilation)
|
||||
layer = ResidualBlock(residual_channels, condition_channels,
|
||||
kernel_size, dilation)
|
||||
self.append(layer)
|
||||
|
||||
def forward(self, x, condition):
|
||||
|
@ -397,7 +428,9 @@ class Flow(nn.Layer):
|
|||
super(Flow, self).__init__()
|
||||
# input projection
|
||||
self.input_proj = nn.utils.weight_norm(
|
||||
nn.Conv2D(1, channels, (1, 1),
|
||||
nn.Conv2D(
|
||||
1,
|
||||
channels, (1, 1),
|
||||
weight_attr=I.Uniform(-1., 1.),
|
||||
bias_attr=I.Uniform(-1., 1.)))
|
||||
|
||||
|
@ -406,7 +439,9 @@ class Flow(nn.Layer):
|
|||
self.dilations_dict[n_group])
|
||||
|
||||
# output projection
|
||||
self.output_proj = nn.Conv2D(channels, 2, (1, 1),
|
||||
self.output_proj = nn.Conv2D(
|
||||
channels,
|
||||
2, (1, 1),
|
||||
weight_attr=I.Constant(0.),
|
||||
bias_attr=I.Constant(0.))
|
||||
|
||||
|
@ -452,8 +487,8 @@ class Flow(nn.Layer):
|
|||
transformation from x to z.
|
||||
"""
|
||||
# (B, C, H-1, W)
|
||||
logs, b = self._predict_parameters(
|
||||
x[:, :, :-1, :], condition[:, :, 1:, :])
|
||||
logs, b = self._predict_parameters(x[:, :, :-1, :],
|
||||
condition[:, :, 1:, :])
|
||||
z = self._transform(x, logs, b)
|
||||
return z, (logs, b)
|
||||
|
||||
|
@ -514,7 +549,8 @@ class Flow(nn.Layer):
|
|||
z_row = z[:, :, i:i + 1, :]
|
||||
condition_row = condition[:, :, i:i + 1, :]
|
||||
|
||||
x_next_row, (logs, b) = self._inverse_row(z_row, x_row, condition_row)
|
||||
x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
|
||||
condition_row)
|
||||
x.append(x_next_row)
|
||||
logs_list.append(logs)
|
||||
b_list.append(b)
|
||||
|
@ -549,13 +585,17 @@ class WaveFlow(nn.LayerList):
|
|||
kernel_size : Union[int, List[int]]
|
||||
Kernel size of the convolution layer in each ResidualBlock.
|
||||
"""
|
||||
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, kernel_size):
|
||||
|
||||
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
|
||||
kernel_size):
|
||||
if n_group % 2 or n_flows % 2:
|
||||
raise ValueError("number of flows and number of group must be even "
|
||||
raise ValueError(
|
||||
"number of flows and number of group must be even "
|
||||
"since a permutation along group among flows is used.")
|
||||
super(WaveFlow, self).__init__()
|
||||
for _ in range(n_flows):
|
||||
self.append(Flow(n_layers, channels, mel_bands, kernel_size, n_group))
|
||||
self.append(
|
||||
Flow(n_layers, channels, mel_bands, kernel_size, n_group))
|
||||
|
||||
# permutations in h
|
||||
self.perms = self._create_perm(n_group, n_flows)
|
||||
|
@ -572,7 +612,8 @@ class WaveFlow(nn.LayerList):
|
|||
if i < n_flows // 2:
|
||||
perms.append(indices[::-1])
|
||||
else:
|
||||
perm = list(reversed(indices[:half])) + list(reversed(indices[half:]))
|
||||
perm = list(reversed(indices[:half])) + list(
|
||||
reversed(indices[half:]))
|
||||
perms.append(perm)
|
||||
return perms
|
||||
|
||||
|
@ -612,8 +653,10 @@ class WaveFlow(nn.LayerList):
|
|||
x, condition = self._trim(x, condition)
|
||||
|
||||
# to (B, C, h, T//h) layout
|
||||
x = paddle.unsqueeze(paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
|
||||
condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
|
||||
x = paddle.unsqueeze(
|
||||
paddle.transpose(fold(x, self.n_group), [0, 2, 1]), 1)
|
||||
condition = paddle.transpose(
|
||||
fold(condition, self.n_group), [0, 1, 3, 2])
|
||||
|
||||
# flows
|
||||
logs_list = []
|
||||
|
@ -654,8 +697,10 @@ class WaveFlow(nn.LayerList):
|
|||
|
||||
z, condition = self._trim(z, condition)
|
||||
# to (B, C, h, T//h) layout
|
||||
z = paddle.unsqueeze(paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
|
||||
condition = paddle.transpose(fold(condition, self.n_group), [0, 1, 3, 2])
|
||||
z = paddle.unsqueeze(
|
||||
paddle.transpose(fold(z, self.n_group), [0, 2, 1]), 1)
|
||||
condition = paddle.transpose(
|
||||
fold(condition, self.n_group), [0, 1, 3, 2])
|
||||
|
||||
# reverse it flow by flow
|
||||
for i in reversed(range(self.n_flows)):
|
||||
|
@ -695,6 +740,7 @@ class ConditionalWaveFlow(nn.LayerList):
|
|||
kernel_size : Union[int, List[int]]
|
||||
Kernel size of the convolution layer in each ResidualBlock.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
upsample_factors: List[int],
|
||||
n_flows: int,
|
||||
|
@ -795,8 +841,7 @@ class ConditionalWaveFlow(nn.LayerList):
|
|||
ConditionalWaveFlow
|
||||
The model built from pretrained result.
|
||||
"""
|
||||
model = cls(
|
||||
upsample_factors=config.model.upsample_factors,
|
||||
model = cls(upsample_factors=config.model.upsample_factors,
|
||||
n_flows=config.model.n_flows,
|
||||
n_layers=config.model.n_layers,
|
||||
n_group=config.model.n_group,
|
||||
|
@ -816,6 +861,7 @@ class WaveFlowLoss(nn.Layer):
|
|||
The standard deviation of the gaussian noise used in WaveFlow, by
|
||||
default 1.0.
|
||||
"""
|
||||
|
||||
def __init__(self, sigma=1.0):
|
||||
super(WaveFlowLoss, self).__init__()
|
||||
self.sigma = sigma
|
||||
|
@ -839,6 +885,7 @@ class WaveFlowLoss(nn.Layer):
|
|||
Tensor [shape=(1,)]
|
||||
The loss.
|
||||
"""
|
||||
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma) - log_det_jacobian
|
||||
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
|
||||
) - log_det_jacobian
|
||||
loss = loss / np.prod(z.shape)
|
||||
return loss + self.const
|
||||
|
|
|
@ -30,6 +30,7 @@ from parakeet.utils import checkpoint, layer_tools
|
|||
|
||||
__all__ = ["WaveNet", "ConditionalWaveNet"]
|
||||
|
||||
|
||||
def crop(x, audio_start, audio_length):
|
||||
"""Crop the upsampled condition to match audio_length.
|
||||
|
||||
|
@ -96,6 +97,7 @@ class UpsampleNet(nn.LayerList):
|
|||
---------
|
||||
``librosa.core.stft``
|
||||
"""
|
||||
|
||||
def __init__(self, upscale_factors=[16, 16]):
|
||||
super(UpsampleNet, self).__init__()
|
||||
self.upscale_factors = list(upscale_factors)
|
||||
|
@ -106,7 +108,9 @@ class UpsampleNet(nn.LayerList):
|
|||
for factor in self.upscale_factors:
|
||||
self.append(
|
||||
nn.utils.weight_norm(
|
||||
nn.Conv2DTranspose(1, 1,
|
||||
nn.Conv2DTranspose(
|
||||
1,
|
||||
1,
|
||||
kernel_size=(3, 2 * factor),
|
||||
stride=(1, factor),
|
||||
padding=(1, factor // 2))))
|
||||
|
@ -159,6 +163,7 @@ class ResidualBlock(nn.Layer):
|
|||
dilation :int
|
||||
Dilation of the internal convolution cells.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
residual_channels: int,
|
||||
condition_dim: int,
|
||||
|
@ -170,9 +175,11 @@ class ResidualBlock(nn.Layer):
|
|||
# following clarinet's implementation, we do not have parametric residual
|
||||
# & skip connection.
|
||||
|
||||
_filter_size = filter_size[0] if isinstance(filter_size, (list, tuple)) else filter_size
|
||||
_filter_size = filter_size[0] if isinstance(filter_size, (
|
||||
list, tuple)) else filter_size
|
||||
std = math.sqrt(1 / (_filter_size * residual_channels))
|
||||
conv = Conv1dCell(residual_channels,
|
||||
conv = Conv1dCell(
|
||||
residual_channels,
|
||||
dilated_channels,
|
||||
filter_size,
|
||||
dilation=dilation,
|
||||
|
@ -180,7 +187,9 @@ class ResidualBlock(nn.Layer):
|
|||
self.conv = nn.utils.weight_norm(conv)
|
||||
|
||||
std = math.sqrt(1 / condition_dim)
|
||||
condition_proj = Conv1dCell(condition_dim, dilated_channels, (1,),
|
||||
condition_proj = Conv1dCell(
|
||||
condition_dim,
|
||||
dilated_channels, (1, ),
|
||||
weight_attr=I.Normal(scale=std))
|
||||
self.condition_proj = nn.utils.weight_norm(condition_proj)
|
||||
|
||||
|
@ -309,6 +318,7 @@ class ResidualNet(nn.LayerList):
|
|||
Kernel size of the internal ``Conv1dCell`` of each ``ResidualBlock``.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_stack: int,
|
||||
n_loop: int,
|
||||
|
@ -320,7 +330,9 @@ class ResidualNet(nn.LayerList):
|
|||
dilations = [2**i for i in range(n_loop)] * n_stack
|
||||
self.context_size = 1 + sum(dilations)
|
||||
for dilation in dilations:
|
||||
self.append(ResidualBlock(residual_channels, condition_dim, filter_size, dilation))
|
||||
self.append(
|
||||
ResidualBlock(residual_channels, condition_dim, filter_size,
|
||||
dilation))
|
||||
|
||||
def forward(self, x, condition=None):
|
||||
"""Forward pass of ``ResidualNet``.
|
||||
|
@ -426,6 +438,7 @@ class WaveNet(nn.Layer):
|
|||
This is only used for computing loss when ``loss_type`` is "mog", If
|
||||
the predicted log scale is less than -9.0, it is clipped at -9.0.
|
||||
"""
|
||||
|
||||
def __init__(self, n_stack, n_loop, residual_channels, output_dim,
|
||||
condition_dim, filter_size, loss_type, log_scale_min):
|
||||
|
||||
|
@ -437,19 +450,24 @@ class WaveNet(nn.Layer):
|
|||
else:
|
||||
if (output_dim % 3 != 0):
|
||||
raise ValueError(
|
||||
"with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".format(output_dim))
|
||||
self.embed = nn.utils.weight_norm(nn.Linear(1, residual_channels), dim=1)
|
||||
"with Mixture of Gaussians(mog) output, the output dim must be divisible by 3, but get {}".
|
||||
format(output_dim))
|
||||
self.embed = nn.utils.weight_norm(
|
||||
nn.Linear(1, residual_channels), dim=1)
|
||||
|
||||
self.resnet = ResidualNet(n_stack, n_loop, residual_channels,
|
||||
condition_dim, filter_size)
|
||||
self.context_size = self.resnet.context_size
|
||||
|
||||
skip_channels = residual_channels # assume the same channel
|
||||
self.proj1 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
|
||||
self.proj2 = nn.utils.weight_norm(nn.Linear(skip_channels, skip_channels), dim=1)
|
||||
self.proj1 = nn.utils.weight_norm(
|
||||
nn.Linear(skip_channels, skip_channels), dim=1)
|
||||
self.proj2 = nn.utils.weight_norm(
|
||||
nn.Linear(skip_channels, skip_channels), dim=1)
|
||||
# if loss_type is softmax, output_dim is n_vocab of waveform magnitude.
|
||||
# if loss_type is mog, output_dim is 3 * gaussian, (weight, mean and stddev)
|
||||
self.proj3 = nn.utils.weight_norm(nn.Linear(skip_channels, output_dim), dim=1)
|
||||
self.proj3 = nn.utils.weight_norm(
|
||||
nn.Linear(skip_channels, output_dim), dim=1)
|
||||
|
||||
self.loss_type = loss_type
|
||||
self.output_dim = output_dim
|
||||
|
@ -781,6 +799,7 @@ class ConditionalWaveNet(nn.Layer):
|
|||
This is only used for computing loss when ``loss_type`` is "mog", If
|
||||
the predicted log scale is less than -9.0, it is clipped at -9.0.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
upsample_factors: List[int],
|
||||
n_stack: int,
|
||||
|
@ -793,7 +812,8 @@ class ConditionalWaveNet(nn.Layer):
|
|||
log_scale_min: float=-9.0):
|
||||
super(ConditionalWaveNet, self).__init__()
|
||||
self.encoder = UpsampleNet(upsample_factors)
|
||||
self.decoder = WaveNet(n_stack=n_stack,
|
||||
self.decoder = WaveNet(
|
||||
n_stack=n_stack,
|
||||
n_loop=n_loop,
|
||||
residual_channels=residual_channels,
|
||||
output_dim=output_dim,
|
||||
|
@ -943,8 +963,7 @@ class ConditionalWaveNet(nn.Layer):
|
|||
ConditionalWaveNet
|
||||
The model built from pretrained result.
|
||||
"""
|
||||
model = cls(
|
||||
upsample_factors=config.model.upsample_factors,
|
||||
model = cls(upsample_factors=config.model.upsample_factors,
|
||||
n_stack=config.model.n_stack,
|
||||
n_loop=config.model.n_loop,
|
||||
residual_channels=config.model.residual_channels,
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
|
@ -86,6 +100,7 @@ class STFT(nn.Layer):
|
|||
Ony ``center`` and ``reflect`` padding is supported now.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, n_fft, hop_length, win_length, window="hanning"):
|
||||
super(STFT, self).__init__()
|
||||
self.hop_length = hop_length
|
||||
|
@ -109,7 +124,8 @@ class STFT(nn.Layer):
|
|||
(self.n_bin, 1, 1, self.n_fft))
|
||||
|
||||
w = np.concatenate([w_real, w_imag], axis=0)
|
||||
self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
|
||||
self.weight = paddle.cast(
|
||||
paddle.to_tensor(w), paddle.get_default_dtype())
|
||||
|
||||
def forward(self, x):
|
||||
"""Compute the stft transform.
|
||||
|
|
|
@ -20,6 +20,7 @@ __all__ = [
|
|||
"Conv1dBatchNorm",
|
||||
]
|
||||
|
||||
|
||||
class Conv1dCell(nn.Conv1D):
|
||||
"""A subclass of Conv1D layer, which can be used in an autoregressive
|
||||
decoder like an RNN cell.
|
||||
|
@ -231,6 +232,7 @@ class Conv1dBatchNorm(nn.Layer):
|
|||
epsilon : [type], optional
|
||||
The epsilon of the BatchNorm1D layer, by default 1e-05
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
|
|
|
@ -1,6 +1,21 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
|
||||
def shuffle_dim(x, axis, perm=None):
|
||||
"""Permute input tensor along aixs given the permutation or randomly.
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numba
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
@ -11,6 +25,7 @@ __all__ = [
|
|||
"diagonal_loss",
|
||||
]
|
||||
|
||||
|
||||
def weighted_mean(input, weight):
|
||||
"""Weighted mean. It can also be used as masked mean.
|
||||
|
||||
|
@ -88,8 +103,7 @@ def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
|
|||
return loss
|
||||
|
||||
|
||||
def diagonal_loss(
|
||||
attentions,
|
||||
def diagonal_loss(attentions,
|
||||
input_lengths,
|
||||
target_lengths,
|
||||
g=0.2,
|
||||
|
@ -133,6 +147,7 @@ def diagonal_loss(
|
|||
else:
|
||||
return paddle.mean(attentions * paddle.unsqueeze(W_tensor, 1))
|
||||
|
||||
|
||||
@numba.jit(nopython=True)
|
||||
def guided_attention(N, max_N, T, max_T, g):
|
||||
W = np.zeros((max_T, max_N), dtype=np.float32)
|
||||
|
@ -142,6 +157,7 @@ def guided_attention(N, max_N, T, max_T, g):
|
|||
# (T_dec, T_enc)
|
||||
return W
|
||||
|
||||
|
||||
def guided_attentions(input_lengths, target_lengths, g=0.2):
|
||||
B = len(input_lengths)
|
||||
max_input_len = input_lengths.max()
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle.fluid.layers import sequence_mask
|
||||
|
||||
|
@ -8,6 +22,7 @@ __all__ = [
|
|||
"future_mask",
|
||||
]
|
||||
|
||||
|
||||
def id_mask(input, padding_index=0, dtype="bool"):
|
||||
"""Generate mask with input ids.
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
@ -5,6 +19,7 @@ from paddle.nn import functional as F
|
|||
|
||||
__all__ = ["positional_encoding"]
|
||||
|
||||
|
||||
def positional_encoding(start_index, length, size, dtype=None):
|
||||
r"""Generate standard positional encoding matrix.
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
@ -12,6 +26,7 @@ __all__ = [
|
|||
"TransformerDecoderLayer",
|
||||
]
|
||||
|
||||
|
||||
class PositionwiseFFN(nn.Layer):
|
||||
"""A faithful implementation of Position-wise Feed-Forward Network
|
||||
in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
|
||||
|
@ -30,10 +45,8 @@ class PositionwiseFFN(nn.Layer):
|
|||
The probability of the Dropout applied to the output of the first
|
||||
layer, by default 0.
|
||||
"""
|
||||
def __init__(self,
|
||||
input_size: int,
|
||||
hidden_size: int,
|
||||
dropout=0.0):
|
||||
|
||||
def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
|
||||
super(PositionwiseFFN, self).__init__()
|
||||
self.linear1 = nn.Linear(input_size, hidden_size)
|
||||
self.linear2 = nn.Linear(hidden_size, input_size)
|
||||
|
@ -86,6 +99,7 @@ class TransformerEncoderLayer(nn.Layer):
|
|||
------
|
||||
It uses the PostLN (post layer norm) scheme.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
|
||||
super(TransformerEncoderLayer, self).__init__()
|
||||
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
|
||||
|
@ -118,14 +132,12 @@ class TransformerEncoderLayer(nn.Layer):
|
|||
"""
|
||||
context_vector, attn_weights = self.self_mha(x, x, x, mask)
|
||||
x = self.layer_norm1(
|
||||
F.dropout(x + context_vector,
|
||||
self.dropout,
|
||||
training=self.training))
|
||||
F.dropout(
|
||||
x + context_vector, self.dropout, training=self.training))
|
||||
|
||||
x = self.layer_norm2(
|
||||
F.dropout(x + self.ffn(x),
|
||||
self.dropout,
|
||||
training=self.training))
|
||||
F.dropout(
|
||||
x + self.ffn(x), self.dropout, training=self.training))
|
||||
return x, attn_weights
|
||||
|
||||
|
||||
|
@ -155,6 +167,7 @@ class TransformerDecoderLayer(nn.Layer):
|
|||
------
|
||||
It uses the PostLN (post layer norm) scheme.
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
|
||||
super(TransformerDecoderLayer, self).__init__()
|
||||
self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
|
||||
|
@ -197,20 +210,19 @@ class TransformerDecoderLayer(nn.Layer):
|
|||
cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
|
||||
Decoder-encoder cross attention.
|
||||
"""
|
||||
context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
|
||||
context_vector, self_attn_weights = self.self_mha(q, q, q,
|
||||
decoder_mask)
|
||||
q = self.layer_norm1(
|
||||
F.dropout(q + context_vector,
|
||||
self.dropout,
|
||||
training=self.training))
|
||||
F.dropout(
|
||||
q + context_vector, self.dropout, training=self.training))
|
||||
|
||||
context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
|
||||
context_vector, cross_attn_weights = self.cross_mha(q, k, v,
|
||||
encoder_mask)
|
||||
q = self.layer_norm2(
|
||||
F.dropout(q + context_vector,
|
||||
self.dropout,
|
||||
training=self.training))
|
||||
F.dropout(
|
||||
q + context_vector, self.dropout, training=self.training))
|
||||
|
||||
q = self.layer_norm3(
|
||||
F.dropout(q + self.ffn(q),
|
||||
self.dropout,
|
||||
training=self.training))
|
||||
F.dropout(
|
||||
q + self.ffn(q), self.dropout, training=self.training))
|
||||
return q, self_attn_weights, cross_attn_weights
|
||||
|
|
|
@ -1,2 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.training.cli import *
|
||||
from parakeet.training.experiment import *
|
||||
|
|
|
@ -1,5 +1,20 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
|
||||
|
||||
def default_argument_parser():
|
||||
r"""A simple yet genral argument parser for experiments with parakeet.
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode
|
||||
|
||||
_C = CfgNode(
|
||||
|
@ -5,8 +19,8 @@ _C = CfgNode(
|
|||
valid_interval=1000, # validation
|
||||
save_interval=10000, # checkpoint
|
||||
max_iteration=900000, # max iteration to train
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
|
||||
def get_default_training_config():
|
||||
return _C.clone()
|
||||
|
|
|
@ -27,6 +27,7 @@ from parakeet.utils import checkpoint, mp_tools
|
|||
|
||||
__all__ = ["ExperimentBase"]
|
||||
|
||||
|
||||
class ExperimentBase(object):
|
||||
"""
|
||||
An experiment template in order to structure the training code and take
|
||||
|
|
|
@ -45,6 +45,7 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
|
|||
|
||||
return iteration
|
||||
|
||||
|
||||
def _save_checkpoint(checkpoint_dir: str, iteration: int):
|
||||
"""Save the iteration number of the latest model to be checkpointed.
|
||||
|
||||
|
@ -60,6 +61,7 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
|
|||
with open(checkpoint_record, "wt") as handle:
|
||||
handle.write("model_checkpoint_path: step-{}".format(iteration))
|
||||
|
||||
|
||||
def load_parameters(model,
|
||||
optimizer=None,
|
||||
checkpoint_dir=None,
|
||||
|
@ -97,18 +99,19 @@ def load_parameters(model,
|
|||
params_path = checkpoint_path + ".pdparams"
|
||||
model_dict = paddle.load(params_path)
|
||||
model.set_state_dict(model_dict)
|
||||
print("[checkpoint] Rank {}: loaded model from {}".format(
|
||||
local_rank, params_path))
|
||||
print("[checkpoint] Rank {}: loaded model from {}".format(local_rank,
|
||||
params_path))
|
||||
|
||||
optimizer_path = checkpoint_path + ".pdopt"
|
||||
if optimizer and os.path.isfile(optimizer_path):
|
||||
optimizer_dict = paddle.load(optimizer_path)
|
||||
optimizer.set_state_dict(optimizer_dict)
|
||||
print("[checkpoint] Rank {}: loaded optimizer state from {}".
|
||||
format(local_rank, optimizer_path))
|
||||
print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
|
||||
local_rank, optimizer_path))
|
||||
|
||||
return iteration
|
||||
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
|
||||
"""Checkpoint the latest trained model parameters.
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from paddle.framework import core
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@ def summary(layer: nn.Layer):
|
|||
print("layer has {} parameters, {} elements.".format(num_params,
|
||||
num_elements))
|
||||
|
||||
|
||||
def gradient_norm(layer: nn.Layer):
|
||||
grad_norm_dict = {}
|
||||
for name, param in layer.state_dict().items():
|
||||
|
@ -36,6 +37,7 @@ def gradient_norm(layer: nn.Layer):
|
|||
grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
|
||||
return grad_norm_dict
|
||||
|
||||
|
||||
def recursively_remove_weight_norm(layer: nn.Layer):
|
||||
for layer in layer.sublayers():
|
||||
try:
|
||||
|
@ -44,10 +46,12 @@ def recursively_remove_weight_norm(layer: nn.Layer):
|
|||
# ther is not weight norm hoom in this layer
|
||||
pass
|
||||
|
||||
|
||||
def freeze(layer: nn.Layer):
|
||||
for param in layer.parameters():
|
||||
param.trainable = False
|
||||
|
||||
|
||||
def unfreeze(layer: nn.Layer):
|
||||
for param in layer.parameters():
|
||||
param.trainable = True
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from functools import wraps
|
||||
|
@ -16,6 +30,3 @@ def rank_zero_only(func):
|
|||
return result
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
|
||||
__all__ = ["SchedulerBase", "Constant", "PieceWise", "StepWise"]
|
||||
|
@ -59,4 +73,3 @@ class StepWise(SchedulerBase):
|
|||
if i == 0:
|
||||
return self.ys[0]
|
||||
return self.ys[i - 1]
|
||||
|
||||
|
|
10
setup.py
10
setup.py
|
@ -48,7 +48,6 @@ setup_info = dict(
|
|||
description='Speech synthesis tools and models based on Paddlepaddle',
|
||||
long_description=long_description,
|
||||
license='Apache 2',
|
||||
|
||||
python_requires='>=3.6',
|
||||
install_requires=[
|
||||
'numpy',
|
||||
|
@ -71,14 +70,11 @@ setup_info = dict(
|
|||
'yacs',
|
||||
'tensorboardX',
|
||||
],
|
||||
extras_require={
|
||||
'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"],
|
||||
},
|
||||
extras_require={'doc': ["sphinx", "sphinx-rtd-theme", "numpydoc"], },
|
||||
|
||||
# Package info
|
||||
packages=find_packages(exclude=('tests', 'tests.*')),
|
||||
zip_safe=True,
|
||||
|
||||
classifiers=[
|
||||
'Development Status :: 4 - Beta',
|
||||
'Intended Audience :: Developers',
|
||||
|
@ -86,8 +82,6 @@ setup_info = dict(
|
|||
'License :: OSI Approved :: Apache2 License',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
],
|
||||
|
||||
)
|
||||
], )
|
||||
|
||||
setup(**setup_info)
|
||||
|
|
Loading…
Reference in New Issue