add license

lifuchen 2020-02-26 21:03:51 +08:00
parent f84d6bec91
commit 9d79699432
92 changed files with 3322 additions and 1455 deletions

View File

@@ -25,3 +25,11 @@
files: \.md$
- id: remove-tabs
files: \.md$
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./tools/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
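A quick sketch of how the negative lookahead in the `exclude` pattern above behaves (assuming pre-commit compiles it with Python's `re` module; the paths are illustrative):

```python
import re

# Each alternative matches a whole path only when the named substring is
# absent: the negative lookahead (?!.*third_party) fails as soon as
# "third_party" occurs anywhere in the path.
no_third_party = re.compile(r"(?!.*third_party)^.*$")

print(bool(no_third_party.match("parakeet/audio/audio.py")))  # True
print(bool(no_third_party.match("third_party/foo/bar.cc")))   # False
```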

View File

@@ -112,4 +112,3 @@ example script:
```bash
python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
```

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import csv
from pathlib import Path
@@ -79,10 +93,11 @@ class Transform(object):
y = signal.lfilter([1., -self.preemphasis], [1.], wav)
# STFT
D = librosa.stft(y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
D = librosa.stft(
y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
S = np.abs(D)
# to db and normalize to 0-1
@@ -96,11 +111,8 @@ class Transform(object):
# mel scale and to db and normalize to 0-1,
# CAUTION: pass linear scale S, not dbscaled S
S_mel = librosa.feature.melspectrogram(S=S,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax,
power=1.)
S_mel = librosa.feature.melspectrogram(
S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
S_mel = 20 * np.log10(np.maximum(amplitude_min,
S_mel)) - self.ref_level_db
S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
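A minimal sketch of the "to db and normalize to 0-1" step above; `min_level_db = -100` and `ref_level_db = 20` are illustrative assumptions, not values read from this repo's configs:

```python
import numpy as np

def normalize_db(S, min_level_db=-100.0, ref_level_db=20.0, amplitude_min=1e-5):
    # amplitude -> dB relative to ref_level_db, then map [min_level_db, 0] to [0, 1]
    S_db = 20 * np.log10(np.maximum(amplitude_min, S)) - ref_level_db
    return (S_db - min_level_db) / (-min_level_db)

S = np.array([1.0, 0.1, 1e-6])
print(normalize_db(S))  # [ 0.8  0.6 -0.2]; the amplitude_min floor caps how
                        # negative the last value can get before any clipping
```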
@@ -148,20 +160,18 @@ class DataCollector(object):
(mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, num_frames) = example
text_sequences.append(
np.pad(mix_grapheme_phonemes,
(0, max_text_length - text_length)))
np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length
)))
lin_specs.append(
np.pad(S_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -
self._pad_begin - num_frames))))
mel_specs.append(
np.pad(S_mel_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -
self._pad_begin - num_frames))))
done_flags.append(
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
(0, max_decoder_length -
int(np.ceil(num_frames // self._factor))),
(0, max_decoder_length - int(
np.ceil(num_frames // self._factor))),
constant_values=1))
text_sequences = np.array(text_sequences).astype(np.int64)
lin_specs = np.transpose(np.array(lin_specs),
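A shape-only sketch of the padding convention `DataCollector` uses above: `_pad_begin` zero frames on the left, then zeros on the right up to `max_frames` (sizes illustrative):

```python
import numpy as np

S = np.ones((80, 7), np.float32)  # toy (n_mels, num_frames) spectrogram
pad_begin, max_frames = 2, 12
padded = np.pad(S, ((0, 0), (pad_begin, max_frames - pad_begin - 7)))
print(padded.shape)  # (80, 12): 2 leading and 3 trailing zero frames
```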

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
@@ -22,11 +36,8 @@ if __name__ == "__main__":
parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
parser.add_argument("text", type=str, help="text file to synthesize")
parser.add_argument("output_path", type=str, help="path to save results")
parser.add_argument("-g",
"--device",
type=int,
default=-1,
help="device to use")
parser.add_argument(
"-g", "--device", type=int, default=-1, help="device to use")
args = parser.parse_args()
with open(args.config, 'rt') as f:
@@ -76,15 +87,14 @@ if __name__ == "__main__":
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
padding_idx, embedding_std, max_positions, n_vocab,
freeze_embedding, filter_size, encoder_channels,
n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate,
window_backward, window_ahead, key_projection,
value_projection, downsample_factor, linear_dim,
use_decoder_states, converter_channels, dropout)
dv3 = make_model(
n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
embedding_std, max_positions, n_vocab, freeze_embedding,
filter_size, encoder_channels, n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, window_backward,
window_ahead, key_projection, value_projection, downsample_factor,
linear_dim, use_decoder_states, converter_channels, dropout)
summary(dv3)
state, _ = dg.load_dygraph(args.checkpoint)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from matplotlib import cm
@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
converter_channels, dropout):
"""just a simple function to create a deepvoice 3 model"""
if n_speakers > 1:
spe = dg.Embedding((n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
spe = dg.Embedding(
(n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
else:
spe = None
@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
)
enc = Encoder(n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
dropout=dropout)
ConvSpec(h, k, 3), )
enc = Encoder(
n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
dropout=dropout)
if freeze_embedding:
freeze(enc.embed)
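If `ConvSpec(h, k, d)` is read as (output channels, kernel size, dilation), an assumption suggested by the 1, 3, 9, 27 pattern above, then stacking these dilations grows the receptive field roughly exponentially. A sketch of the standard receptive-field arithmetic for stride-1 convolutions:

```python
def receptive_field(dilations, k=5):
    # each stride-1 conv adds (k - 1) * dilation samples of context
    return 1 + sum((k - 1) * d for d in dilations)

print(receptive_field([1, 3, 9, 27, 1, 3]))  # 177
```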
@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
)
ConvSpec(h, k, 1), )
attention = [True, False, False, False, True]
force_monotonic_attention = [True, False, False, False, True]
dec = Decoder(n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=r,
max_positions=max_positions,
padding_idx=padding_idx,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=WindowRange(window_behind, window_ahead),
key_projection=key_projection,
value_projection=value_projection)
dec = Decoder(
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=r,
max_positions=max_positions,
padding_idx=padding_idx,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=WindowRange(window_behind, window_ahead),
key_projection=key_projection,
value_projection=value_projection)
if not trainable_positional_encodings:
freeze(dec.embed_keys_positions)
freeze(dec.embed_query_positions)
@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1),
ConvSpec(2 * h, k, 3),
)
cvt = Converter(n_speakers,
speaker_dim,
dec.state_dim if use_decoder_states else mel_dim,
linear_dim,
time_upsampling=downsample_factor,
convolutions=postnet_convolutions,
dropout=dropout)
ConvSpec(2 * h, k, 3), )
cvt = Converter(
n_speakers,
speaker_dim,
dec.state_dim if use_decoder_states else mel_dim,
linear_dim,
time_upsampling=downsample_factor,
convolutions=postnet_convolutions,
dropout=dropout)
dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
return dv3
@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
ref_level_db, power, n_iter, win_length, hop_length,
preemphasis):
"""generate waveform from text using a deepvoice 3 model"""
text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob),
dtype=np.int64)
text = np.array(
en.text_to_sequence(
text, p=replace_pronounciation_prob),
dtype=np.int64)
length = len(text)
print("text sequence's length: {}".format(length))
text_positions = np.arange(1, 1 + length)
@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
"""
denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
wav = librosa.griffinlim(lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
wav = librosa.griffinlim(
lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
if preemphasis > 0:
wav = signal.lfilter([1.], [1., -preemphasis], wav)
return wav
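The `np.exp((denormalized + ref_level_db) / 20 * np.log(10))` expression above is just the inverse of the 20·log10 dB mapping, since exp(x/20 · ln 10) = 10^(x/20). A one-line check:

```python
import numpy as np

x = -37.5  # an arbitrary dB value
assert np.isclose(np.exp(x / 20 * np.log(10)), 10 ** (x / 20))
```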
@@ -225,28 +243,30 @@ def save_state(save_dir,
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"target_mel_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "target_mel_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("target/mel_spec",
cm.viridis(mel_input),
global_step,
dataformats="HWC")
writer.add_image(
"target/mel_spec",
cm.viridis(mel_input),
global_step,
dataformats="HWC")
plt.figure(figsize=(10, 3))
display.specshow(mel_output)
plt.colorbar()
plt.title("mel_output")
plt.savefig(
os.path.join(
path, "predicted_mel_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("predicted/mel_spec",
cm.viridis(mel_output),
global_step,
dataformats="HWC")
writer.add_image(
"predicted/mel_spec",
cm.viridis(mel_output),
global_step,
dataformats="HWC")
if lin_input is not None and lin_output is not None:
lin_input = lin_input[0].numpy().T
@@ -258,28 +278,30 @@ def save_state(save_dir,
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"target_lin_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "target_lin_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("target/lin_spec",
cm.viridis(lin_input),
global_step,
dataformats="HWC")
writer.add_image(
"target/lin_spec",
cm.viridis(lin_input),
global_step,
dataformats="HWC")
plt.figure(figsize=(10, 3))
display.specshow(lin_output)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(
path, "predicted_lin_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("predicted/lin_spec",
cm.viridis(lin_output),
global_step,
dataformats="HWC")
writer.add_image(
"predicted/lin_spec",
cm.viridis(lin_output),
global_step,
dataformats="HWC")
if alignments is not None and len(alignments.shape) == 4:
path = os.path.join(save_dir, "alignments")
@@ -290,10 +312,11 @@ def save_state(save_dir,
"train_attn_layer_{}_step_{}.png".format(idx, global_step))
plot_alignment(attn_layer, save_path)
writer.add_image("train_attn/layer_{}".format(idx),
cm.viridis(attn_layer),
global_step,
dataformats="HWC")
writer.add_image(
"train_attn/layer_{}".format(idx),
cm.viridis(attn_layer),
global_step,
dataformats="HWC")
if lin_output is not None:
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
@@ -302,7 +325,5 @@ def save_state(save_dir,
save_path = os.path.join(
path, "train_sample_step_{:09d}.wav".format(global_step))
sf.write(save_path, wav, sample_rate)
writer.add_audio("train_sample",
wav,
global_step,
sample_rate=sample_rate)
writer.add_audio(
"train_sample", wav, global_step, sample_rate=sample_rate)

View File

@@ -1,36 +1,90 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
parser.add_argument(
'--config_path',
type=str,
default='config/fastspeech.yaml',
help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
parser.add_argument(
'--batch_size', type=int, default=32, help="batch size for training.")
parser.add_argument(
'--epochs',
type=int,
default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
parser.add_argument(
'--lr',
type=float,
default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
parser.add_argument(
'--save_step',
type=int,
default=500,
help="checkpointing interval during training.")
parser.add_argument('--fastspeech_step', type=int, default=70000,
parser.add_argument(
'--fastspeech_step',
type=int,
default=70000,
help="Global step to restore checkpoint of fastspeech.")
parser.add_argument('--use_gpu', type=int, default=1,
parser.add_argument(
'--use_gpu',
type=int,
default=1,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0,
parser.add_argument(
'--use_data_parallel',
type=int,
default=0,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
parser.add_argument(
'--data_path',
type=str,
default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
parser.add_argument(
'--checkpoint_path',
type=str,
default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
parser.add_argument(
'--save_path',
type=str,
default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
parser.add_argument(
'--log_dir',
type=str,
default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
parser.add_argument(
'--sample_path',
type=str,
default='./sample',
help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log',
parser.add_argument(
'--transtts_path',
type=str,
default='./log',
help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=160000,
parser.add_argument(
'--transformer_step',
type=int,
default=160000,
help="the step to load transformerTTS model.")

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tensorboardX import SummaryWriter
from collections import OrderedDict
@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
from parakeet import audio
from parakeet.models.fastspeech.fastspeech import FastSpeech
def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict
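The key renaming in `load_checkpoint` above strips the `_layers.` prefix that Paddle's `DataParallel` wrapper adds to parameter names, so a checkpoint saved from a wrapped model can be loaded into a plain one. A toy sketch of the same dict transformation:

```python
saved = {"_layers.encoder.w_0": 1, "decoder.b_0": 2}  # illustrative keys
cleaned = {k[len("_layers."):] if k.startswith("_layers.") else k: v
           for k, v in saved.items()}
print(cleaned)  # {'encoder.w_0': 1, 'decoder.b_0': 2}
```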
def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
# tensorboard
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'synthesis')
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
@@ -37,15 +52,19 @@ def synthesis(text_input, args):
with dg.guard(place):
model = FastSpeech(cfg)
model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
model.set_dict(
load_checkpoint(
str(args.fastspeech_step),
os.path.join(args.checkpoint_path, "fastspeech")))
model.eval()
text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
mel_output, mel_output_postnet = model(
text, pos_text, alpha=args.alpha)
_ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'],
@@ -53,8 +72,8 @@ def synthesis(text_input, args):
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'],
win_length=cfg['audio']['win_length'],
hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'],
signal_norm=True,
@@ -67,12 +86,15 @@ def synthesis(text_input, args):
do_trim_silence=False,
sound_norm=False)
mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
mel_output_postnet = fluid.layers.transpose(
fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy(
))
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
print("Synthesis completed !!!")
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import os
@@ -20,8 +33,10 @@ import sys
sys.path.append("../transformer_tts")
from data import LJSpeechLoader
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
model_dict, opti_dict = fluid.dygraph.load_dygraph(
os.path.join(model_path, step))
new_state_dict = OrderedDict()
for param in model_dict:
if param.startswith('_layers.'):
@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict
def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
@@ -43,26 +59,33 @@ def main(args):
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'fastspeech')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'fastspeech')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
with fluid.unique_name.guard():
transformerTTS = TransformerTTS(cfg)
model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
model_dict, _ = load_checkpoint(
str(args.transformer_step),
os.path.join(args.transtts_path, "transformer"))
transformerTTS.set_dict(model_dict)
transformerTTS.eval()
model = FastSpeech(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
model_dict, opti_dict = load_checkpoint(
str(args.fastspeech_step),
os.path.join(args.checkpoint_path, "fastspeech"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = args.fastspeech_step
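Why the learning rate is `NoamDecay(1 / (warm_up_step * lr**2), warm_up_step)`: assuming Paddle's `NoamDecay(d_model, warmup)` implements the standard Noam schedule `d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)`, choosing `d_model = 1 / (warmup * lr**2)` makes the peak rate at `step == warmup` come out to exactly `lr`:

```python
import numpy as np

def noam(step, lr=0.001, warmup=4000):
    d_model = 1 / (warmup * lr ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

print(np.isclose(noam(4000), 0.001))  # True: peak equals the configured lr
```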
@@ -76,31 +99,42 @@ def main(args):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
_, _, attn_probs, _, _, _ = transformerTTS(
character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(
get_alignment(attn_probs, mel_lens, cfg[
'transformer_head'])).astype(np.float32)
global_step += 1
#Forward
result= model(character,
pos_text,
mel_pos=pos_mel,
length_target=alignment)
result = model(
character,
pos_text,
mel_pos=pos_mel,
length_target=alignment)
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
mel_loss = layers.mse_loss(mel_output, mel)
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
duration_loss = layers.mean(
layers.abs(
layers.elementwise_sub(duration_predictor_output,
alignment)))
total_loss = mel_loss + mel_postnet_loss + duration_loss
if local_rank==0:
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if local_rank == 0:
writer.add_scalar('mel_loss',
mel_loss.numpy(), global_step)
writer.add_scalar('post_mel_loss',
mel_postnet_loss.numpy(), global_step)
writer.add_scalar('duration_loss',
duration_loss.numpy(), global_step)
writer.add_scalar('learning_rate',
optimizer._learning_rate.step().numpy(),
global_step)
if args.use_data_parallel:
total_loss = model.scale_loss(total_loss)
@@ -108,21 +142,25 @@ def main(args):
model.apply_collective_grads()
else:
total_loss.backward()
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
optimizer.minimize(
total_loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % args.save_step == 0:
# save checkpoint
if local_rank == 0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
save_path = os.path.join(args.save_path,
'fastspeech/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
if local_rank == 0:
writer.close()
if __name__ =='__main__':
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
args = parser.parse_args()

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
@@ -12,22 +25,42 @@ from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader:
def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
def __init__(self,
config,
args,
nranks,
rank,
is_vocoder=False,
shuffle=True):
place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(args.data_path)
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer)
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
sampler = DistributedSampler(
len(metadata), nranks, rank, shuffle=shuffle)
assert args.batch_size % nranks == 0
each_bs = args.batch_size // nranks
if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
dataloader = DataCargo(
dataset,
sampler=sampler,
batch_size=each_bs,
shuffle=shuffle,
batch_fn=batch_examples_vocoder,
drop_last=True)
else:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
dataloader = DataCargo(
dataset,
sampler=sampler,
batch_size=each_bs,
shuffle=shuffle,
batch_fn=batch_examples,
drop_last=True)
self.reader = fluid.io.DataLoader.from_generator(
capacity=32,
@@ -68,8 +101,8 @@ class LJSpeech(object):
min_level_db=config['audio']['min_level_db'],
ref_level_db=config['audio']['ref_level_db'],
n_fft=config['audio']['n_fft'],
win_length= config['audio']['win_length'],
hop_length= config['audio']['hop_length'],
win_length=config['audio']['win_length'],
hop_length=config['audio']['hop_length'],
power=config['audio']['power'],
preemphasis=config['audio']['preemphasis'],
signal_norm=True,
@@ -95,8 +128,10 @@ class LJSpeech(object):
wav = self._ljspeech_processor.load_wav(str(fname))
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
phonemes = np.array(
g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes
) # maybe we need to implement it as a map in the future
def batch_examples(batch):
@@ -109,7 +144,10 @@ def batch_examples(batch):
pos_mels = []
for data in batch:
_, mel, text = data
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
mel_inputs.append(
np.concatenate(
[np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
axis=-1))
mel_lens.append(mel.shape[1])
text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1))
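A sketch of the decoder-input shift built above: prepend one all-zero "go" frame and drop the last frame, so that frame t of `mel_input` only contains information from frames earlier than t:

```python
import numpy as np

mel = np.arange(6, dtype=np.float32).reshape(2, 3)  # toy (num_mels, T) spectrogram
mel_input = np.concatenate(
    [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]], axis=-1)
print(mel_input)  # [[0. 0. 1.]
                  #  [0. 3. 4.]]
```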
@@ -118,35 +156,59 @@ def batch_examples(batch):
texts.append(text)
# Sort by text_len in descending order
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
texts = [
i
for i, _ in sorted(
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
]
mels = [
i
for i, _ in sorted(
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
]
mel_inputs = [
i
for i, _ in sorted(
zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
]
mel_lens = [
i
for i, _ in sorted(
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
]
pos_texts = [
i
for i, _ in sorted(
zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
]
pos_mels = [
i
for i, _ in sorted(
zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
]
text_lens = sorted(text_lens, reverse=True)
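The six comprehensions above each re-sort one list by text length; computing the sort order once is an equivalent, more compact formulation (a sketch, not part of the commit):

```python
texts, mels = ["ab", "c", "abc"], ["M1", "M2", "M3"]  # toy batch
text_lens = [2, 1, 3]
order = sorted(range(len(text_lens)), key=text_lens.__getitem__, reverse=True)
texts = [texts[i] for i in order]  # ['abc', 'ab', 'c']
mels = [mels[i] for i in order]    # ['M3', 'M1', 'M2']
```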
# Pad each sequence to the longest length in the batch
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
mel_inputs = np.transpose(
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
np.array(mel_lens))
def batch_examples_vocoder(batch):
mels=[]
mags=[]
mels = []
mags = []
for data in batch:
mag, mel, _ = data
mels.append(mel)
mags.append(mag)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
return (mels, mags)

View File

@@ -1,38 +1,100 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
parser.add_argument(
'--config_path',
type=str,
default='config/train_transformer.yaml',
help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
parser.add_argument(
'--batch_size', type=int, default=32, help="batch size for training.")
parser.add_argument(
'--epochs',
type=int,
default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
parser.add_argument(
'--lr',
type=float,
default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
parser.add_argument(
'--save_step',
type=int,
default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
parser.add_argument(
'--image_step',
type=int,
default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
parser.add_argument(
'--max_len',
type=int,
default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
parser.add_argument(
'--transformer_step',
type=int,
default=160000,
help="Global step to restore checkpoint of transformer.")
parser.add_argument('--vocoder_step', type=int, default=90000,
parser.add_argument(
'--vocoder_step',
type=int,
default=90000,
help="Global step to restore checkpoint of postnet.")
parser.add_argument('--use_gpu', type=int, default=1,
parser.add_argument(
'--use_gpu',
type=int,
default=1,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0,
parser.add_argument(
'--use_data_parallel',
type=int,
default=0,
help="use data parallel or not during training.")
parser.add_argument('--stop_token', type=int, default=0,
parser.add_argument(
'--stop_token',
type=int,
default=0,
help="use stop token loss in network or not.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
parser.add_argument(
'--data_path',
type=str,
default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
parser.add_argument(
'--checkpoint_path',
type=str,
default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
parser.add_argument(
'--save_path',
type=str,
default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
parser.add_argument(
'--log_dir',
type=str,
default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
parser.add_argument(
'--sample_path',
type=str,
default='./sample',
help="the directory to save audio sample in synthesis.")

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
@@ -16,6 +29,7 @@ from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict
def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
@@ -34,36 +49,43 @@ def synthesis(text_input, args):
# tensorboard
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'synthesis')
writer = SummaryWriter(path)
with dg.guard(place):
with fluid.unique_name.guard():
model = TransformerTTS(cfg)
model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")))
model.set_dict(
load_checkpoint(
str(args.transformer_step),
os.path.join(args.checkpoint_path, "transformer")))
model.eval()
with fluid.unique_name.guard():
model_vocoder = Vocoder(cfg, args.batch_size)
model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")))
model_vocoder.set_dict(
load_checkpoint(
str(args.vocoder_step),
os.path.join(args.checkpoint_path, "vocoder")))
model_vocoder.eval()
# init input
text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
pbar = tqdm(range(args.max_len))
for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1)
pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1)
mag_pred = model_vocoder(postnet_pred)
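A shape-only sketch of the greedy decoding loop above: `mel_input` starts as a single zero "go" frame and grows by the last predicted frame on every step (the ones tensor below stands in for the model call):

```python
import numpy as np

mel_input = np.zeros([1, 1, 80], np.float32)  # the "go" frame
for _ in range(3):
    postnet_pred = np.ones([1, mel_input.shape[1], 80], np.float32)  # fake model output
    mel_input = np.concatenate([mel_input, postnet_pred[:, -1:, :]], axis=1)
print(mel_input.shape)  # (1, 4, 80): one new frame per decoding step
```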
_ljspeech_processor = audio.AudioProcessor(
@@ -72,8 +94,8 @@ def synthesis(text_input, args):
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'],
win_length=cfg['audio']['win_length'],
hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'],
signal_norm=True,
@@ -86,13 +108,18 @@ def synthesis(text_input, args):
do_trim_silence=False,
sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
wav = _ljspeech_processor.inv_spectrogram(
fluid.layers.transpose(
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path)
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
write(
os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
wav)
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser)

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tqdm import tqdm
from tensorboardX import SummaryWriter
@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
from data import LJSpeechLoader
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
model_dict, opti_dict = fluid.dygraph.load_dygraph(
os.path.join(model_path, step))
new_state_dict = OrderedDict()
for param in model_dict:
if param.startswith('_layers.'):
@@ -40,8 +55,8 @@ def main(args):
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'transformer')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None
@@ -49,13 +64,18 @@ def main(args):
model = TransformerTTS(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
model_dict, opti_dict = load_checkpoint(
str(args.transformer_step),
os.path.join(args.checkpoint_path, "transformer"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = args.transformer_step
@@ -68,60 +88,82 @@ def main(args):
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, _ = data
global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
character, mel_input, pos_text, pos_mel)
label = (pos_mel == 0).astype(np.float32)
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss
# Note: training did not work when the stop token loss was used.
if args.stop_token:
stop_loss = cross_entropy(stop_preds, label)
loss = loss + stop_loss
if local_rank==0:
if local_rank == 0:
writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(),
'post_mel_loss':post_mel_loss.numpy()
'mel_loss': mel_loss.numpy(),
'post_mel_loss': post_mel_loss.numpy()
}, global_step)
if args.stop_token:
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
writer.add_scalar('stop_loss',
stop_loss.numpy(), global_step)
if args.use_data_parallel:
writer.add_scalars('alphas', {
'encoder_alpha':model._layers.encoder.alpha.numpy(),
'decoder_alpha':model._layers.decoder.alpha.numpy(),
'encoder_alpha':
model._layers.encoder.alpha.numpy(),
'decoder_alpha':
model._layers.decoder.alpha.numpy(),
}, global_step)
else:
writer.add_scalars('alphas', {
'encoder_alpha':model.encoder.alpha.numpy(),
'decoder_alpha':model.decoder.alpha.numpy(),
'encoder_alpha': model.encoder.alpha.numpy(),
'decoder_alpha': model.decoder.alpha.numpy(),
}, global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
writer.add_scalar('learning_rate',
optimizer._learning_rate.step().numpy(),
global_step)
if global_step % args.image_step == 1:
for i, prob in enumerate(attn_probs):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
x = np.uint8(
cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_enc):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
x = np.uint8(
cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_enc_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_dec):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
x = np.uint8(
cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_dec_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
if args.use_data_parallel:
loss = model.scale_loss(loss)
@@ -129,21 +171,25 @@ def main(args):
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
optimizer.minimize(
loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % args.save_step == 0:
if local_rank == 0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
save_path = os.path.join(args.save_path,
'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
if local_rank == 0:
writer.close()
if __name__ =='__main__':
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train TransformerTTS model")
add_config_options_to_parser(parser)

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
from data import LJSpeechLoader
from parakeet.models.transformer_tts.vocoder import Vocoder
def load_checkpoint(step, model_path):
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
@@ -23,6 +37,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict
def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
@@ -37,8 +52,8 @@ def main(args):
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'vocoder')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'vocoder')
writer = SummaryWriter(path) if local_rank == 0 else None
@@ -46,12 +61,15 @@ def main(args):
model = Vocoder(cfg, args.batch_size)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))
model_dict, opti_dict = load_checkpoint(
str(args.vocoder_step),
os.path.join(args.checkpoint_path, "vocoder"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = args.vocoder_step
@@ -61,19 +79,21 @@ def main(args):
strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
pbar.set_description('Processing at epoch %d' % epoch)
mel, mag = data
mag = dg.to_variable(mag.numpy())
mel = dg.to_variable(mel.numpy())
global_step += 1
mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
loss = layers.mean(
layers.abs(layers.elementwise_sub(mag_pred, mag)))
if args.use_data_parallel:
loss = model.scale_loss(loss)
@@ -81,24 +101,29 @@ def main(args):
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
optimizer.minimize(
loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients()
if local_rank==0:
writer.add_scalars('training_loss',{
'loss':loss.numpy(),
if local_rank == 0:
writer.add_scalars('training_loss', {
'loss': loss.numpy(),
}, global_step)
if global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'vocoder/%d' % global_step)
save_path = os.path.join(args.save_path,
'vocoder/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
if local_rank == 0:
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train vocoder model")
add_config_options_to_parser(parser)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.0.0"
from . import data, g2p, models, modules

View File

@@ -1 +1,15 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor

View File

@@ -1,30 +1,46 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
class AudioProcessor(object):
def __init__(self,
sample_rate=None, # int, sampling rate
num_mels=None, # int, bands of mel spectrogram
min_level_db=None, # float, minimum level db
ref_level_db=None, # float, reference level db
n_fft=None, # int: number of samples in a frame for stft
win_length=None, # int: the same meaning with n_fft
hop_length=None, # int: number of samples between neighboring frame
power=None, # float:power to raise before griffin-lim
preemphasis=None, # float: preemphasis coefficient
signal_norm=None, #
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_norm]
max_norm=None, # float, max norm
mel_fmin=None, # int: mel spectrogram's minimum frequency
mel_fmax=None, # int: mel spectrogram's maximum frequency
clip_norm=True, # bool: clip spectrogram's norm
griffin_lim_iters=None, # int:
do_trim_silence=False, # bool: trim silence
sound_norm=False,
**kwargs):
def __init__(
self,
sample_rate=None, # int, sampling rate
num_mels=None, # int, bands of mel spectrogram
min_level_db=None, # float, minimum level db
ref_level_db=None, # float, reference level db
n_fft=None, # int: number of samples in a frame for stft
win_length=None, # int: the same meaning with n_fft
hop_length=None, # int: number of samples between neighboring frame
power=None, # float:power to raise before griffin-lim
preemphasis=None, # float: preemphasis coefficient
signal_norm=None, #
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_norm]
max_norm=None, # float, max norm
mel_fmin=None, # int: mel spectrogram's minimum frequency
mel_fmax=None, # int: mel spectrogram's maximum frequency
clip_norm=True, # bool: clip spectrogram's norm
griffin_lim_iters=None, # int:
do_trim_silence=False, # bool: trim silence
sound_norm=False,
**kwargs):
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
@ -52,7 +68,8 @@ class AudioProcessor(object):
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
)
def _stft_parameters(self):
"""compute frame length and hop length in ms"""
@ -65,44 +82,54 @@ class AudioProcessor(object):
"""object repr"""
cls_name_str = self.__class__.__name__
members = vars(self)
dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()])
dict_str = "\n".join(
[" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str
def save_wav(self, path, wav):
"""save audio with scipy.io.wavfile in 16bit integers"""
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
scipy.io.wavfile.write(path, self.sample_rate,
wav_norm.astype(np.int16))
def load_wav(self, path, sr=None):
"""load wav -> trim_silence -> rescale"""
x, sr = librosa.load(path, sr=None)
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate)
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
sr, self.sample_rate)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
print(" [!] File cannot be trimmed for silence - {}".format(path))
print(" [!] File cannot be trimmed for silence - {}".format(
path))
if self.sound_norm:
x = x / x.max() * 0.9 # scale peak to 0.9 to leave headroom
x = x / x.max() * 0.9  # scale peak to 0.9 to leave headroom
return x
def trim_silence(self, wav):
"""Trim soilent parts with a threshold and 0.01s margin"""
margin = int(self.sample_rate * 0.01)
wav = wav[margin: -margin]
trimmed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
wav = wav[margin:-margin]
trimmed_wav = librosa.effects.trim(
wav,
top_db=60,
frame_length=self.win_length,
hop_length=self.hop_length)[0]
return trimmed_wav
def apply_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
raise RuntimeError(
" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
raise RuntimeError(
" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
def _amplitude_to_db(self, x):
@ -125,12 +152,11 @@ class AudioProcessor(object):
"""return mel basis for mel scale"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
return librosa.filters.mel(self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
@ -156,12 +182,15 @@ class AudioProcessor(object):
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
S_denorm = (S_denorm + self.max_norm) * (
-self.min_level_db) / (2 * self.max_norm
) + self.min_level_db
return S_denorm
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
S_denorm = S_denorm * (-self.min_level_db
) / self.max_norm + self.min_level_db
return S_denorm
else:
return S
@ -174,7 +203,8 @@ class AudioProcessor(object):
hop_length=self.hop_length)
def _istft(self, S):
return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
return librosa.istft(
S, hop_length=self.hop_length, win_length=self.win_length)
def spectrogram(self, y):
"""compute linear spectrogram(amplitude)
@ -195,7 +225,8 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
S = self._amplitude_to_db(self._linear_to_mel(np.abs(
D))) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
@ -203,16 +234,16 @@ class AudioProcessor(object):
S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def inv_melspectrogram(self, mel_spectrogram):
S = self._denormalize(mel_spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
S = self._mel_to_linear(np.abs(S))
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
"""convert output linear spec to mel spec"""
@ -234,18 +265,18 @@ class AudioProcessor(object):
@staticmethod
def mulaw_encode(wav, qc):
mu = 2 ** qc - 1
mu = 2**qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
# Quantize signal to the specified number of levels.
signal = (signal + 1) / 2 * mu + 0.5
return np.floor(signal,)
return np.floor(signal, )
@staticmethod
def mulaw_decode(wav, qc):
"""Recovers waveform from quantized values."""
mu = 2 ** qc - 1
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
mu = 2**qc - 1
x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
return x
@staticmethod
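Since the µ-law pair above is easy to misread, here is a roundtrip sketch in plain numpy; it assumes the usual convention that `mulaw_decode` receives values rescaled back to [-1, 1], which this hunk does not show:

```python
import numpy as np

mu = 2**8 - 1                                  # qc = 8 -> 255 levels
wav = np.linspace(-1., 1., 5)                  # toy waveform in [-1, 1]

# encode: compand, then quantize to integers in [0, mu]
companded = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
quantized = np.floor((companded + 1) / 2 * mu + 0.5)

# decode: rescale back to [-1, 1], then expand
rescaled = 2 * quantized / mu - 1
recovered = np.sign(rescaled) / mu * ((1 + mu)**np.abs(rescaled) - 1)
print(np.abs(recovered - wav).max())           # small quantization error
```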

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dataset import *
from .datacargo import *
from .sampler import *

View File

@ -1,10 +1,25 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functions for batching arrays that satisfy certain conditions.
"""
import numpy as np
class TextIDBatcher(object):
"""A wrapper class for a function to build a functor, which holds the configs to pass to the function."""
def __init__(self, pad_id=0, dtype=np.int64):
self.pad_id = pad_id
self.dtype = dtype
@ -13,6 +28,7 @@ class TextIDBatcher(object):
out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype)
return out
def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
"""
minibatch: List[Example]
@ -21,16 +37,21 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
peek_example = minibatch[0]
assert len(peek_example.shape) == 1, "text example is a 1D tensor"
lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
lengths = [example.shape[0] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths)
batch = []
for example in minibatch:
pad_len = max_len - example.shape[0]
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id))
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
return np.array(batch, dtype=dtype)
class WavBatcher(object):
def __init__(self, pad_value=0., dtype=np.float32):
self.pad_value = pad_value
@ -40,6 +61,7 @@ class WavBatcher(object):
out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out
def batch_wav(minibatch, pad_value=0., dtype=np.float32):
"""
minibatch: List[Example]
@ -52,16 +74,23 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
elif len(peek_example.shape) == 2:
mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
lengths = [example.shape[-1] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths)
batch = []
for example in minibatch:
pad_len = max_len - example.shape[-1]
if mono_channel:
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value))
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
else:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
@ -75,6 +104,7 @@ class SpecBatcher(object):
out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out
def batch_spec(minibatch, pad_value=0., dtype=np.float32):
"""
minibatch: List[Example]
@ -87,15 +117,22 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
elif len(peek_example.shape) == 3:
mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame)
lengths = [example.shape[-1] for example in minibatch
] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
batch = []
for example in minibatch:
pad_len = max_len - example.shape[-1]
if mono_channel:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value))
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
else:
batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
batch.append(
np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
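A quick sketch of how these batchers are used (toy arrays; the import path is the one used elsewhere in this repo, and `SpecBatcher`'s constructor is assumed to mirror `WavBatcher`'s `pad_value`/`dtype`):

```python
import numpy as np
from parakeet.data.batch import TextIDBatcher, SpecBatcher

# ragged text-id sequences -> right-padded (2, 4) int64 array
texts = [np.array([2, 3, 5, 7]), np.array([11, 13])]
print(TextIDBatcher(pad_id=0)(texts).shape)     # (2, 4)

# (F, n_frame) spectrograms with different n_frame -> padded on the last axis
specs = [np.zeros((80, 50), np.float32), np.zeros((80, 30), np.float32)]
print(SpecBatcher(pad_value=0.)(specs).shape)   # (2, 80, 50)
```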

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import numpy as np
@ -9,8 +23,7 @@ class DatasetMixin(object):
if isinstance(index, slice):
start, stop, step = index.indices(len(self))
return [
self.get_example(i)
for i in six.moves.range(start, stop, step)
self.get_example(i) for i in six.moves.range(start, stop, step)
]
elif isinstance(index, (list, np.ndarray)):
return [self.get_example(i) for i in index]
@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
def get_example(self, i):
if i < 0:
raise IndexError(
"ChainDataset doesnot support negative indexing.")
raise IndexError("ChainDataset doesnot support negative indexing.")
for dataset in self._datasets:
if i < len(dataset):

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In most cases, we have a non-stream dataset, which means we can randomly access it with __getitem__ and get the length of the dataset with __len__.
@ -6,10 +19,10 @@ This suffices for a sampler. We implement a sampler as an iterable of valid indices.
So the sampler is only responsible for generating valid indices.
"""
import numpy as np
import random
class Sampler(object):
def __init__(self, data_source):
pass
@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement:
raise ValueError("With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
raise ValueError(
"With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples))
"value, but got num_samples={}".format(
self.num_samples))
@property
def num_samples(self):
@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self):
n = len(self.data_source)
if self.replacement:
return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist())
return iter(
np.random.randint(
0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist())
def __len__(self):
@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices
def __iter__(self):
return (self.indices[i] for i in np.random.permutation(len(self.indices)))
return (self.indices[i]
for i in np.random.permutation(len(self.indices)))
def __len__(self):
return len(self.indices)
@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permute mini-batches
"""
def __init__(self, lengths, batch_size=4, batch_group_size=None,
def __init__(self,
lengths,
batch_size=4,
batch_group_size=None,
permutate=True):
_lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key
_lengths = np.array(
lengths,
dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths)
@ -112,13 +135,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size):
s = i * batch_group_size
e = s + batch_group_size
random.shuffle(indices[s: e]) # inplace
random.shuffle(indices[s:e]) # inplace
# Permutate batches
if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm)
indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1)
indices[:e] = indices[:e].reshape(
-1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements
s += batch_group_size
@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(num_samples))
"value, but got num_samples={}".format(
num_samples))
self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples
self.replacement = replacement
def __iter__(self):
return iter(np.random.choice(len(self.weights), size=(self.num_samples, ),
replace=self.replacement, p=self.weights).tolist())
return iter(
np.random.choice(
len(self.weights),
size=(self.num_samples, ),
replace=self.replacement,
p=self.weights).tolist())
def __len__(self):
return self.num_samples
@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers]
assert len(indices) == self.num_samples
assert len(indices) == self.num_samples
return iter(indices)
@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of "
"Sampler, but got sampler={}"
.format(sampler))
"Sampler, but got sampler={}".format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))

View File

@ -15,8 +15,3 @@ One of the reasons we choose to load data lazily (only load metadata beforehand
In deep learning practice, we typically batch examples, so the dataset should come with a method to batch them. Assume a record is implemented as a tuple with several items. When an item is a fixed-size array, batching is trivial: `np.stack` suffices. But for arrays with dynamic sizes, padding is needed first. We decide to implement a batching method for each item; batching a record can then be composed from these methods. For a dataset, a `_batch_examples` method should be implemented, but in most cases you can choose one from `batching.py`, as in the sketch below.
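A minimal sketch of the two cases in plain numpy (toy shapes, not tied to this repo's datasets):

```python
import numpy as np

# fixed-size items: np.stack suffices
mels = [np.zeros((80, 100), np.float32) for _ in range(4)]
print(np.stack(mels).shape)                     # (4, 80, 100)

# dynamically sized items: pad to the longest, then stack
texts = [np.array([2, 3, 5]), np.array([7, 11])]
max_len = max(len(t) for t in texts)
padded = np.stack([
    np.pad(t, (0, max_len - len(t)), mode="constant", constant_values=0)
    for t in texts
])
print(padded.shape)                             # (2, 3)
```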
That is it!

View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pandas as pd
from ruamel.yaml import YAML
@ -11,9 +25,11 @@ from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, WavBatcher
class VCTK(Dataset):
def __init__(self, root):
assert isinstance(root, (str, Path)), "root should be a string or Path object"
assert isinstance(root, (
str, Path)), "root should be a string or Path object"
self.root = root if isinstance(root, Path) else Path(root)
self.text_root = self.root.joinpath("txt")
self.wav_root = self.root.joinpath("wav48")
@ -24,10 +40,10 @@ class VCTK(Dataset):
self.speaker_indices, self.metadata = self._load_metadata()
def _load_metadata(self):
yaml=YAML(typ='safe')
yaml = YAML(typ='safe')
speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
metadata = pd.read_csv(self.root.joinpath("metadata.csv"),
sep="|", quoting=3, header=1)
metadata = pd.read_csv(
self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
return speaker_indices, metadata
def _prepare_metadata(self):
@ -41,15 +57,19 @@ class VCTK(Dataset):
with io.open(str(text_file)) as f:
transcription = f.read().strip()
wav_file = text_file.with_suffix(".wav")
metadata.append((wav_file.name, speaker_folder.name, transcription))
metadata = pd.DataFrame.from_records(metadata,
columns=["wave_file", "speaker", "text"])
metadata.append(
(wav_file.name, speaker_folder.name, transcription))
metadata = pd.DataFrame.from_records(
metadata, columns=["wave_file", "speaker", "text"])
# save them
yaml=YAML(typ='safe')
yaml = YAML(typ='safe')
yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
metadata.to_csv(self.root.joinpath("metadata.csv"),
sep="|", quoting=3, index=False)
metadata.to_csv(
self.root.joinpath("metadata.csv"),
sep="|",
quoting=3,
index=False)
def _get_example(self, metadatum):
wave_file, speaker, text = metadatum
@ -77,5 +97,3 @@ class VCTK(Dataset):
speaker_batch = np.array(speaker_batch)
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
return wav_batch, speaker_batch, phoneme_batch

View File

@ -1,5 +1,4 @@
# coding: utf-8
"""Text processing frontend
All frontend module should have the following functions:

View File

@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
from ..text import text_to_sequence
text = text_to_sequence(text, ["english_cleaners"])
return text

View File

@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
from ..text import text_to_sequence
text = text_to_sequence(text, ["basic_cleaners"])
return text

View File

@ -1,6 +1,5 @@
# coding: utf-8
import MeCab
import jaconv
from random import random
@ -30,9 +29,9 @@ def _yomi(mecab_result):
def _mix_pronunciation(tokens, yomis, p):
return "".join(
yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
for idx in range(len(tokens)))
return "".join(yomis[idx]
if yomis[idx] is not None and random() < p else tokens[idx]
for idx in range(len(tokens)))
def mix_pronunciation(text, p):
@ -59,8 +58,7 @@ def normalize_delimitor(text):
def text_to_sequence(text, p=0.0):
for c in [" ", " ", "", "", "", "", "", "", "",
"", "", "(", ")"]:
for c in [" ", " ", "", "", "", "", "", "", "", "", "", "(", ")"]:
text = text.replace(c, "")
text = text.replace("!", "")
text = text.replace("?", "")

View File

@ -1,6 +1,5 @@
# coding: utf-8
from random import random
n_vocab = 0xffff
@ -13,5 +12,6 @@ _tagger = None
def text_to_sequence(text, p=0.0):
return [ord(c) for c in text] + [_eos] # EOS
def sequence_to_text(seq):
return "".join(chr(n) for n in seq)

View File

@ -1,8 +1,21 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from . import cleaners
from .symbols import symbols
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
sequence += _symbols_to_sequence(
_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Cleaners are transformations that run over the input text at both training and eval time.
@ -14,31 +27,31 @@ import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):

View File

@ -1,14 +1,28 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
valid_symbols = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
'Y', 'Z', 'ZH'
]
_valid_symbol_set = set(valid_symbols)
@ -24,7 +38,10 @@ class CMUDict:
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
entries = {
word: pron
for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries
def __len__(self):

View File

@ -3,7 +3,6 @@
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@ -56,7 +55,8 @@ def _expand_number(m):
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
return _inflect.number_to_words(
num, andword='', zero='oh', group=2).replace(', ', ' ')
else:
return _inflect.number_to_words(num, andword='')
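For context, roughly what these `inflect` calls produce (exact strings may vary by inflect version):

```python
import inflect

p = inflect.engine()
p.number_to_words(1400)             # 'one thousand, four hundred'
p.number_to_words(14) + ' hundred'  # year-style reading: 'fourteen hundred'
# group=2 reads the digits in pairs, so 1906 -> 'nineteen, oh six';
# the .replace(', ', ' ') above then yields 'nineteen oh six'
p.number_to_words(1906, andword='', zero='oh', group=2).replace(', ', ' ')
```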

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Defines the set of symbols used in text input to the model.

View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
from parakeet.models.deepvoice3.converter import Converter

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
from paddle import fluid
@ -19,23 +33,19 @@ class Attention(dg.Layer):
value_projection=True):
super(Attention, self).__init__()
std = np.sqrt(1 / query_dim)
self.query_proj = Linear(query_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.query_proj = Linear(
query_dim, embed_dim, param_attr=I.Normal(scale=std))
if key_projection:
std = np.sqrt(1 / embed_dim)
self.key_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.key_proj = Linear(
embed_dim, embed_dim, param_attr=I.Normal(scale=std))
if value_projection:
std = np.sqrt(1 / embed_dim)
self.value_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.value_proj = Linear(
embed_dim, embed_dim, param_attr=I.Normal(scale=std))
std = np.sqrt(1 / embed_dim)
self.out_proj = Linear(embed_dim,
query_dim,
param_attr=I.Normal(scale=std))
self.out_proj = Linear(
embed_dim, query_dim, param_attr=I.Normal(scale=std))
self.key_projection = key_projection
self.value_projection = value_projection
@ -102,9 +112,8 @@ class Attention(dg.Layer):
x = F.softmax(x)
attn_scores = x
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = F.matmul(x, values)
encoder_length = keys.shape[1]
# CAUTION: is it wrong? let it be now

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
has a residual connection from the input x, and scales the output by
np.sqrt(0.5).
"""
def __init__(self,
n_speakers,
speaker_dim,
@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
), "this block uses residual connection"\
"the input_channes should equals num_filters"
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
self.conv = Conv1DCell(in_channels,
2 * num_filters,
filter_size,
dilation,
causal,
param_attr=I.Normal(scale=std))
self.conv = Conv1DCell(
in_channels,
2 * num_filters,
filter_size,
dilation,
causal,
param_attr=I.Normal(scale=std))
if n_speakers > 1:
assert (speaker_dim is not None
), "speaker embed should not be null in multi-speaker case"
std = np.sqrt(1 / speaker_dim)
self.fc = Linear(speaker_dim,
num_filters,
param_attr=I.Normal(scale=std))
self.fc = Linear(
speaker_dim, num_filters, param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None):
"""
@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
C_out means the output channels of Conv1DGLU.
"""
residual = x
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = self.conv(x)
content, gate = F.split(x, num_or_sections=2, dim=1)
@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
C_out means the output channels of Conv1DGLU.
"""
residual = x_t
x_t = F.dropout(x_t,
self.dropout,
dropout_implementation="upscale_in_train")
x_t = F.dropout(
x_t, self.dropout, dropout_implementation="upscale_in_train")
x_t = self.conv.add_input(x_t)
content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
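The gating itself falls outside this hunk; in plain numpy, the GLU step the docstring above describes looks roughly like this (toy shapes, standard GLU formulation assumed):

```python
import numpy as np

def sigmoid(a):
    return 1. / (1. + np.exp(-a))

x = np.random.randn(4, 2 * 8, 50)        # (B, 2*C, T) out of the gated conv
content, gate = np.split(x, 2, axis=1)   # each (B, C, T)
out = content * sigmoid(gate)            # GLU gating
residual = np.random.randn(4, 8, 50)     # the block's input
out = np.sqrt(0.5) * (out + residual)    # residual + sqrt(0.5) scaling
```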

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from itertools import chain
@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout),
Conv1DTranspose(
Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout), Conv1DTranspose(
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(
4. / (2 * target_channels)))), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
@ -108,6 +125,7 @@ class Converter(dg.Layer):
Vocoder that transforms mel spectrograms (or decoder hidden states)
to waveform.
"""
def __init__(self,
n_speakers,
speaker_dim,
@ -161,33 +179,36 @@ class Converter(dg.Layer):
std = np.sqrt(std_mul / in_channels)
# CAUTION: relu
self.convolutions.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
Conv1D(
in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation=dilation,
std_mul=std_mul,
dropout=dropout))
Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation=dilation,
std_mul=std_mul,
dropout=dropout))
in_channels = out_channels
std_mul = 4.0
# final conv proj, channel transformed to linear dim
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
# CAUTION: sigmoid
self.last_conv_proj = Conv1D(in_channels,
linear_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
self.last_conv_proj = Conv1D(
in_channels,
linear_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None):
"""

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
class Decoder(dg.Layer):
def __init__(
self,
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=1,
max_positions=512,
padding_idx=None, # remove it!
preattention=(ConvSpec(128, 5, 1), ) * 4,
convolutions=(ConvSpec(128, 5, 1), ) * 4,
attention=True,
dropout=0.0,
use_memory_mask=False,
force_monotonic_attention=False,
query_position_rate=1.0,
key_position_rate=1.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True):
self,
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=1,
max_positions=512,
padding_idx=None, # remove it!
preattention=(ConvSpec(128, 5, 1), ) * 4,
convolutions=(ConvSpec(128, 5, 1), ) * 4,
attention=True,
dropout=0.0,
use_memory_mask=False,
force_monotonic_attention=False,
query_position_rate=1.0,
key_position_rate=1.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True):
super(Decoder, self).__init__()
self.dropout = dropout
@ -111,23 +125,17 @@ class Decoder(dg.Layer):
conv_channels = convolutions[0].out_channels
# only when padding_idx is 0 can we easily handle it
self.embed_keys_positions = PositionEmbedding(max_positions,
embed_dim,
padding_idx=0)
self.embed_query_positions = PositionEmbedding(max_positions,
conv_channels,
padding_idx=0)
self.embed_keys_positions = PositionEmbedding(
max_positions, embed_dim, padding_idx=0)
self.embed_query_positions = PositionEmbedding(
max_positions, conv_channels, padding_idx=0)
if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim)
self.speaker_proj1 = Linear(speaker_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
self.speaker_proj2 = Linear(speaker_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
self.speaker_proj1 = Linear(
speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
self.speaker_proj2 = Linear(
speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
# prenet
self.prenet = dg.LayerList()
@ -138,24 +146,26 @@ class Decoder(dg.Layer):
# conv1d & relu
std = np.sqrt(std_mul / in_channels)
self.prenet.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
Conv1D(
in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.prenet.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=True))
Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=True))
in_channels = out_channels
std_mul = 4.0
@ -184,16 +194,17 @@ class Decoder(dg.Layer):
assert (
in_channels == out_channels
), "the stack of convolution & attention does not change channels"
conv_layer = Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=False)
conv_layer = Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=False)
attn_layer = Attention(
out_channels,
embed_dim,
@ -211,10 +222,8 @@ class Decoder(dg.Layer):
# 1 * 1 conv to transform channels
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.last_conv = Conv1D(in_channels,
mel_dim * r,
1,
param_attr=I.Normal(scale=std))
self.last_conv = Conv1D(
in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))
# mel (before sigmoid) to done hat
std = np.sqrt(1 / in_channels)
@ -308,9 +317,8 @@ class Decoder(dg.Layer):
# (B, C, T)
frames = F.transpose(frames, [0, 2, 1])
x = frames
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
if isinstance(layer, Conv1DGLU):
@ -408,14 +416,13 @@ class Decoder(dg.Layer):
test_inputs = fold_adjacent_frames(test_inputs, self.r)
test_inputs = F.transpose(test_inputs, [0, 2, 1])
initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1),
dtype=keys.dtype)
initial_input = F.zeros(
(batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)
t = 0 # decoder time step
while True:
frame_pos = F.fill_constant((batch_size, 1),
value=t + 1,
dtype="int64")
frame_pos = F.fill_constant(
(batch_size, 1), value=t + 1, dtype="int64")
w = self.query_position_rate
if self.n_speakers > 1:
w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
@ -433,9 +440,8 @@ class Decoder(dg.Layer):
current_input = initial_input
x_t = current_input
x_t = F.dropout(x_t,
self.dropout,
dropout_implementation="upscale_in_train")
x_t = F.dropout(
x_t, self.dropout, dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
@ -453,15 +459,15 @@ class Decoder(dg.Layer):
x_t = F.transpose(x_t, [0, 2, 1])
if frame_pos_embed is not None:
x_t += frame_pos_embed
x_t, attn_scores = attn(
x_t, (keys, values), mask,
last_attended[i] if test_inputs is None else None)
x_t, attn_scores = attn(x_t, (keys, values), mask,
last_attended[i]
if test_inputs is None else None)
x_t = F.transpose(x_t, [0, 2, 1])
step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc)
# update last attended when necessary
if self.force_monotonic_attention[i]:
last_attended[i] = np.argmax(attn_scores.numpy(),
axis=-1)[0][0]
last_attended[i] = np.argmax(
attn_scores.numpy(), axis=-1)[0][0]
x_t = F.scale(residual + x_t, np.sqrt(0.5))
if len(step_attn_scores):
# (B, 1, T_enc) again
@ -485,8 +491,8 @@ class Decoder(dg.Layer):
t += 1
if test_inputs is None:
if F.reduce_min(done_t).numpy(
)[0] > 0.5 and t > self.min_decoder_steps:
if F.reduce_min(done_t).numpy()[
0] > 0.5 and t > self.min_decoder_steps:
break
elif t > self.max_decoder_steps:
break

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
@ -33,14 +47,16 @@ class Encoder(dg.Layer):
self.dropout = dropout
if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim)
self.sp_proj1 = Linear(speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.sp_proj2 = Linear(speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.sp_proj1 = Linear(
speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.sp_proj2 = Linear(
speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.n_speakers = n_speakers
self.convolutions = dg.LayerList()
@ -51,31 +67,34 @@ class Encoder(dg.Layer):
if in_channels != out_channels:
std = np.sqrt(std_mul / in_channels)
self.convolutions.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
Conv1D(
in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=False,
residual=True))
Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=False,
residual=True))
in_channels = out_channels
std_mul = 4.0
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.convolutions.append(
Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
Conv1D(
in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
def forward(self, x, speaker_embed=None):
"""
@ -96,9 +115,8 @@ class Encoder(dg.Layer):
representation for values.
"""
x = self.embed(x)
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = F.transpose(x, [0, 2, 1])
if self.n_speakers > 1 and speaker_embed is not None:

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from numba import jit
@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
return W
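# NOTE: the loop body is elided from this hunk; the weights it fills are
# presumably the usual guided-attention matrix (Tachibana et al.):
#     W[n, t] = 1 - exp(-((n / N - t / T) ** 2) / (2 * g ** 2))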
def guided_attentions(encoder_lengths,
decoder_lengths,
max_decoder_len,
def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
g=0.2):
B = len(encoder_lengths)
max_input_len = encoder_lengths.max()
@ -93,9 +105,8 @@ class TTSLoss(object):
def binary_divergence(self, prediction, target, mask):
flattened_prediction = F.reshape(prediction, [-1, 1])
flattened_target = F.reshape(target, [-1, 1])
flattened_loss = F.log_loss(flattened_prediction,
flattened_target,
epsilon=1e-8)
flattened_loss = F.log_loss(
flattened_prediction, flattened_target, epsilon=1e-8)
bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
w = self.masked_weight
@ -163,23 +174,20 @@ class TTSLoss(object):
max_mel_steps = max_frames // self.downsample_factor
max_decoder_steps = max_mel_steps // self.r
decoder_mask = F.sequence_mask(n_frames // self.downsample_factor //
self.r,
max_decoder_steps,
dtype="float32")
mel_mask = F.sequence_mask(n_frames // self.downsample_factor,
max_mel_steps,
dtype="float32")
decoder_mask = F.sequence_mask(
n_frames // self.downsample_factor // self.r,
max_decoder_steps,
dtype="float32")
mel_mask = F.sequence_mask(
n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
if compute_lin_loss:
lin_hyp = lin_hyp[:, :-self.time_shift, :]
lin_ref = lin_ref[:, self.time_shift:, :]
lin_mask = lin_mask[:, self.time_shift:, :]
lin_l1_loss = self.l1_loss(lin_hyp,
lin_ref,
lin_mask,
priority_bin=self.priority_bin)
lin_l1_loss = self.l1_loss(
lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
lin_loss = self.binary_divergence_weight * lin_bce_loss \
+ (1 - self.binary_divergence_weight) * lin_l1_loss
@ -197,9 +205,10 @@ class TTSLoss(object):
total_loss += mel_loss
if compute_attn_loss:
attn_loss = self.attention_loss(
attn_hyp, input_lengths.numpy(),
n_frames.numpy() // (self.downsample_factor * self.r))
attn_loss = self.attention_loss(attn_hyp,
input_lengths.numpy(),
n_frames.numpy() //
(self.downsample_factor * self.r))
total_loss += attn_loss
if compute_done_loss:

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
mel_outputs, alignments, done, decoder_states = self.decoder(
(keys, values), valid_lengths, mel_inputs, text_positions,
frame_positions, speaker_embed)
linear_outputs = self.converter(
decoder_states if self.use_decoder_states else mel_outputs,
speaker_embed)
linear_outputs = self.converter(decoder_states
if self.use_decoder_states else
mel_outputs, speaker_embed)
return mel_outputs, linear_outputs, alignments, done
def transduce(self, text_sequences, text_positions, speaker_indices=None):
@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder.decode(
(keys, values), text_positions, speaker_embed)
linear_outputs = self.converter(
decoder_states if self.use_decoder_states else mel_outputs,
speaker_embed)
linear_outputs = self.converter(decoder_states
if self.use_decoder_states else
mel_outputs, speaker_embed)
return mel_outputs, linear_outputs, alignments, done

View File

@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.layers as F
@ -95,8 +109,9 @@ class PositionEmbedding(dg.Layer):
speaker_position_rate) # (B, V, C)
# make indices for gather_nd
batch_id = F.expand(
F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]),
[1, time_steps])
F.unsqueeze(
F.range(
0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
# (B, T, 2)
gather_nd_id = F.stack([batch_id, indices], -1)

View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,8 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock
class Decoder(dg.Layer):
def __init__(self,
len_max_seq,
@ -18,13 +32,26 @@ class Decoder(dg.Layer):
super(Decoder, self).__init__()
n_position = len_max_seq + 1
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
self.pos_inp = get_sinusoid_encoding_table(
n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(
size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.layer_stack = [
FFTBlock(
d_model,
d_inner,
n_head,
d_k,
d_v,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=dropout) for _ in range(n_layers)
]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
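
The `add_sublayer` loop matters because layers kept only in a Python list are invisible to dygraph's parameter tracking. A minimal sketch of the registration pattern (the `Stack` class is illustrative):

```python
import paddle.fluid.dygraph as dg

class Stack(dg.Layer):
    def __init__(self, n_layers=3, hidden=8):
        super(Stack, self).__init__()
        self.blocks = [dg.Linear(hidden, hidden) for _ in range(n_layers)]
        for i, block in enumerate(self.blocks):
            # registration exposes the parameters to optimizers and state dicts
            self.add_sublayer("block_{}".format(i), block)
```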

View File

@ -1,8 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock
class Encoder(dg.Layer):
def __init__(self,
n_src_vocab,
@ -19,14 +33,28 @@ class Encoder(dg.Layer):
super(Encoder, self).__init__()
n_position = len_max_seq + 1
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
self.src_word_emb = dg.Embedding(
size=[n_src_vocab, d_model], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(
n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(
size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.layer_stack = [
FFTBlock(
d_model,
d_inner,
n_head,
d_k,
d_v,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=dropout) for _ in range(n_layers)
]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
@ -52,7 +80,8 @@ class Encoder(dg.Layer):
non_pad_mask = get_non_pad_mask(character)
# -- Forward
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
enc_output = self.src_word_emb(character) + self.position_enc(
text_pos) #(N, T, C)
for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder
class FastSpeech(dg.Layer):
def __init__(self, cfg):
" FastSpeech"
super(FastSpeech, self).__init__()
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['encoder_n_layer'],
n_head=cfg['encoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['encoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
out_channels=cfg['duration_predictor_output_size'],
filter_size=cfg['duration_predictor_filter_size'],
dropout=cfg['dropout'])
self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
n_layers=cfg['decoder_n_layer'],
n_head=cfg['decoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['decoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.encoder = Encoder(
n_src_vocab=len(symbols) + 1,
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['encoder_n_layer'],
n_head=cfg['encoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['encoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.length_regulator = LengthRegulator(
input_size=cfg['fs_hidden_size'],
out_channels=cfg['duration_predictor_output_size'],
filter_size=cfg['duration_predictor_filter_size'],
dropout=cfg['dropout'])
self.decoder = Decoder(
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['decoder_n_layer'],
n_head=cfg['decoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['decoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.weight = fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer())
k = math.sqrt(1 / cfg['fs_hidden_size'])
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'],
param_attr = self.weight,
bias_attr = self.bias,)
self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=cfg['audio']['outputs_per_step'],
use_cudnn=True,
dropout=0.1,
batchnorm_last=True)
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k))
self.mel_linear = dg.Linear(
cfg['fs_hidden_size'],
cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
param_attr=self.weight,
bias_attr=self.bias, )
self.postnet = PostConvNet(
n_mels=cfg['audio']['num_mels'],
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=cfg['audio']['outputs_per_step'],
use_cudnn=True,
dropout=0.1,
batchnorm_last=True)
def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
def forward(self,
character,
text_pos,
mel_pos=None,
length_target=None,
alpha=1.0):
"""
FastSpeech model.
@ -80,21 +106,24 @@ class FastSpeech(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
character, text_pos)
if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
target=length_target,
alpha=alpha)
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)
length_regulator_output, duration_predictor_output = self.length_regulator(
encoder_output, target=length_target, alpha=alpha)
decoder_output, dec_slf_attn_list = self.decoder(
length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
else:
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)
length_regulator_output, decoder_pos = self.length_regulator(
encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output,
decoder_pos)
mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output
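
The forward pass above branches on the dygraph tracer's train mode: training consumes ground-truth durations via `length_target`, while inference expands frames by predicted durations scaled by `alpha`. A hedged inference-side sketch; the constructed `model`, input shapes, eval-mode setup, and the two-value return are assumptions following the else branch:

```python
import numpy as np
import paddle.fluid.dygraph as dg

# assumes a constructed FastSpeech `model` inside a dg.guard(), with the
# tracer in eval mode so the inference branch above is taken
character = dg.to_variable(np.array([[4, 9, 13]], dtype="int64"))  # (B, T_text)
text_pos = dg.to_variable(np.array([[1, 2, 3]], dtype="int64"))    # 1-based positions
# alpha > 1 lengthens predicted durations (slower speech); alpha < 1 shortens them
mel, mel_postnet = model(character, text_pos, alpha=1.3)
```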

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle.fluid.dygraph as dg
@ -6,11 +19,32 @@ import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
def __init__(self,
d_model,
d_inner,
n_head,
d_k,
d_v,
filter_size,
padding,
dropout=0.2):
super(FFTBlock, self).__init__()
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
self.slf_attn = MultiheadAttention(
d_model,
d_k,
d_v,
num_head=n_head,
is_bias=True,
dropout=dropout,
is_concat=False)
self.pos_ffn = PositionwiseFeedForward(
d_model,
d_inner,
filter_size=filter_size,
padding=padding,
dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
"""
@ -27,7 +61,8 @@ class FFTBlock(dg.Layer):
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
"""
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
output, slf_attn = self.slf_attn(
enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask
output = self.pos_ffn(output)
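
Multiplying by `non_pad_mask` zeroes the features of padded time steps after each sub-layer. A numpy illustration with toy shapes assumed:

```python
import numpy as np

output = np.ones((1, 4, 2), dtype="float32")         # (B, T, C)
non_pad_mask = np.array([[[1.], [1.], [1.], [0.]]])  # last step is padding
print(output * non_pad_mask)                         # padded step becomes zeros
```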

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import parakeet.models.fastspeech.utils
@ -6,19 +19,23 @@ import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
class LengthRegulator(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(LengthRegulator, self).__init__()
self.duration_predictor = DurationPredictor(input_size=input_size,
out_channels=out_channels,
filter_size=filter_size,
dropout=dropout)
self.duration_predictor = DurationPredictor(
input_size=input_size,
out_channels=out_channels,
filter_size=filter_size,
dropout=dropout)
def LR(self, x, duration_predictor_output, alpha=1.0):
output = []
batch_size = x.shape[0]
for i in range(batch_size):
output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha))
output.append(
self.expand(x[i:i + 1], duration_predictor_output[i:i + 1],
alpha))
output = self.pad(output)
return output
@ -27,8 +44,8 @@ class LengthRegulator(dg.Layer):
out_list = []
for i in range(len(input_ele)):
pad_len = max_len - input_ele[i].shape[0]
one_batch_padded = layers.pad(
input_ele[i], [0, pad_len, 0, 0], pad_value=0.0)
one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
pad_value=0.0)
out_list.append(one_batch_padded)
out_padded = layers.stack(out_list)
return out_padded
@ -37,17 +54,16 @@ class LengthRegulator(dg.Layer):
out = []
time_steps = batch.shape[1]
fertilities = predicted.numpy()
batch = layers.squeeze(batch,[0])
batch = layers.squeeze(batch, [0])
for i in range(time_steps):
if fertilities[0,i]==0:
if fertilities[0, i] == 0:
continue
out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1]))
out.append(
layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1]))
out = layers.concat(out, axis=0)
return out
def forward(self, x, alpha=1.0, target=None):
"""
Length Regulator block in FastSpeech.
@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
else:
duration_predictor_output = layers.round(duration_predictor_output)
output = self.LR(x, duration_predictor_output, alpha)
mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1))
mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
mel_pos = layers.unsqueeze(mel_pos, [0])
return output, mel_pos
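
`LR`/`expand` above repeat each encoder frame by its integer duration and skip zero-duration tokens. The same expansion in plain numpy, with toy values assumed:

```python
import numpy as np

encoder_output = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])  # (T_text, C)
durations = np.array([2, 0, 3])  # frames per token; zeros are skipped
expanded = np.concatenate(
    [np.tile(frame[None, :], [int(d), 1])
     for frame, d in zip(encoder_output, durations) if d > 0],
    axis=0)
print(expanded.shape)  # (5, 2): two copies of frame 0, three of frame 2
```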
class DurationPredictor(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(DurationPredictor, self).__init__()
@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
self.dropout = dropout
k = math.sqrt(1 / self.input_size)
self.conv1 = Conv1D(num_channels = self.input_size,
num_filters = self.out_channels,
filter_size = self.filter_size,
padding=1,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
#data_format='NTC')
self.conv1 = Conv1D(
num_channels=self.input_size,
num_filters=self.out_channels,
filter_size=self.filter_size,
padding=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
#data_format='NTC')
k = math.sqrt(1 / self.out_channels)
self.conv2 = Conv1D(num_channels = self.out_channels,
num_filters = self.out_channels,
filter_size = self.filter_size,
padding=1,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
#data_format='NTC')
self.conv2 = Conv1D(
num_channels=self.out_channels,
num_filters=self.out_channels,
filter_size=self.filter_size,
padding=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
#data_format='NTC')
self.layer_norm1 = dg.LayerNorm(self.out_channels)
self.layer_norm2 = dg.LayerNorm(self.out_channels)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.weight = fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer())
k = math.sqrt(1 / self.out_channels)
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k))
self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight,
bias_attr = self.bias)
self.linear = dg.Linear(
self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)
def forward(self, encoder_output):
"""
@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
out (Variable), Shape(B, T, C), the output of duration predictor.
"""
# encoder_output.shape(N, T, C)
out = layers.transpose(encoder_output, [0,2,1])
out = layers.transpose(encoder_output, [0, 2, 1])
out = self.conv1(out)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
out = self.conv2(out)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
out = layers.relu(self.linear(out))
out = layers.squeeze(out, axes=[-1])
return out
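
The transposes around each convolution exist because `Conv1D` here runs in channel-first (N, C, T) layout while `LayerNorm` and `Linear` expect (N, T, C). The round trip at a glance:

```python
import numpy as np

x = np.zeros((8, 50, 256))          # (N, T, C) as produced upstream
x_nct = np.transpose(x, (0, 2, 1))  # (N, C, T) for the 1-D convolution
x_roundtrip = np.transpose(x_nct, (0, 2, 1))
assert x_roundtrip.shape == x.shape
```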

View File

@ -1,5 +1,19 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def get_alignment(attn_probs, mel_lens, n_head):
max_F = 0
assert attn_probs[0].shape[0] % n_head == 0
@ -8,7 +22,7 @@ def get_alignment(attn_probs, mel_lens, n_head):
for i in range(len(attn_probs)):
multi_attn = attn_probs[i].numpy()
for j in range(n_head):
attn = multi_attn[j*batch_size:(j+1)*batch_size]
attn = multi_attn[j * batch_size:(j + 1) * batch_size]
F = score_F(attn)
if max_F < F:
max_F = F
@ -16,19 +30,19 @@ def get_alignment(attn_probs, mel_lens, n_head):
alignment = compute_duration(max_attn, mel_lens)
return alignment
def score_F(attn):
max = np.max(attn, axis=-1)
mean = np.mean(max)
return mean
def compute_duration(attn, mel_lens):
alignment = np.zeros([attn.shape[0],attn.shape[2]])
alignment = np.zeros([attn.shape[0], attn.shape[2]])
mel_lens = mel_lens.numpy()
for i in range(attn.shape[0]):
for j in range(mel_lens[i]):
max_index = np.argmax(attn[i,j])
alignment[i,max_index] += 1
max_index = np.argmax(attn[i, j])
alignment[i, max_index] += 1
return alignment
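
`compute_duration` counts, per encoder position, how many decoder frames give it the attention argmax; those counts become the durations. A tiny worked example:

```python
import numpy as np

# one example: 4 decoder frames attending over 3 encoder positions
attn = np.array([[[0.9, 0.1, 0.0],
                  [0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.0, 0.2, 0.8]]])
alignment = np.zeros((1, 3))
for j in range(4):  # mel_lens[0] == 4 here
    alignment[0, np.argmax(attn[0, j])] += 1
print(alignment)  # [[2. 1. 1.]] -> durations per encoder position
```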

View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D
from parakeet.modules.dynamic_gru import DynamicGRU
import numpy as np
class CBHG(dg.Layer):
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
max_pool_kernel_size=2, is_post=False):
def __init__(self,
hidden_size,
batch_size,
K=16,
projection_size=256,
num_gru_layers=2,
max_pool_kernel_size=2,
is_post=False):
super(CBHG, self).__init__()
"""
:param hidden_size: dimension of hidden unit
@ -24,28 +44,39 @@ class CBHG(dg.Layer):
self.projection_size = projection_size
self.conv_list = []
k = math.sqrt(1 / projection_size)
self.conv_list.append(Conv1D(num_channels = projection_size,
num_filters = hidden_size,
filter_size = 1,
padding = int(np.floor(1/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
self.conv_list.append(
Conv1D(
num_channels=projection_size,
num_filters=hidden_size,
filter_size=1,
padding=int(np.floor(1 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
k = math.sqrt(1 / hidden_size)
for i in range(2,K+1):
self.conv_list.append(Conv1D(num_channels = hidden_size,
num_filters = hidden_size,
filter_size = i,
padding = int(np.floor(i/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
for i in range(2, K + 1):
self.conv_list.append(
Conv1D(
num_channels=hidden_size,
num_filters=hidden_size,
filter_size=i,
padding=int(np.floor(i / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batchnorm_list = []
for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
data_layout='NCHW'))
self.batchnorm_list.append(
dg.BatchNorm(
hidden_size, data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list):
self.add_sublayer("batchnorm_list_{}".format(i), layer)
@ -53,68 +84,94 @@ class CBHG(dg.Layer):
conv_outdim = hidden_size * K
k = math.sqrt(1 / conv_outdim)
self.conv_projection_1 = Conv1D(num_channels = conv_outdim,
num_filters = hidden_size,
filter_size = 3,
padding = int(np.floor(3/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
self.conv_projection_1 = Conv1D(
num_channels=conv_outdim,
num_filters=hidden_size,
filter_size=3,
padding=int(np.floor(3 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
k = math.sqrt(1 / hidden_size)
self.conv_projection_2 = Conv1D(num_channels = hidden_size,
num_filters = projection_size,
filter_size = 3,
padding = int(np.floor(3/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
self.conv_projection_2 = Conv1D(
num_channels=hidden_size,
num_filters=projection_size,
filter_size=3,
padding=int(np.floor(3 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(
projection_size, data_layout='NCHW')
self.max_pool = Pool1D(
pool_size=max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format="NCT")
self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0)
k = math.sqrt(1 / hidden_size)
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward1 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.fc_reverse1 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.gru_forward1 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=False,
origin_mode=True,
h_0=h_0)
self.gru_reverse1 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0=h_0)
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward2 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.fc_reverse2 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.gru_forward2 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=False,
origin_mode=True,
h_0=h_0)
self.gru_reverse2 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0=h_0)
def _conv_fit_dim(self, x, filter_size=3):
if filter_size % 2 == 0:
return x[:,:,:-1]
return x[:, :, :-1]
else:
return x
@ -124,20 +181,23 @@ class CBHG(dg.Layer):
conv_list = []
conv_input = input_
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i+1)
for i, (conv, batchnorm
) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
conv_input = layers.relu(batchnorm(conv_input))
conv_list.append(conv_input)
conv_cat = layers.concat(conv_list, axis=1)
conv_pool = self.max_pool(conv_cat)[:,:,:-1]
conv_pool = self.max_pool(conv_cat)[:, :, :-1]
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
conv_proj = layers.relu(
self.batchnorm_proj_1(
self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(
self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
# conv_proj.shape = [N, C, T]
highway = layers.transpose(conv_proj, [0,2,1])
highway = layers.transpose(conv_proj, [0, 2, 1])
highway = self.highway(highway)
# highway.shape = [N, T, C]
@ -151,9 +211,10 @@ class CBHG(dg.Layer):
out_forward = self.gru_forward2(fc_forward)
out_reverse = self.gru_reverse2(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
return out
class Highwaynet(dg.Layer):
def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__()
@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
self.linears = []
k = math.sqrt(1 / num_units)
for i in range(num_layers):
self.linears.append(dg.Linear(num_units, num_units,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
self.gates.append(dg.Linear(num_units, num_units,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
self.linears.append(
dg.Linear(
num_units,
num_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
self.gates.append(
dg.Linear(
num_units,
num_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
self.add_sublayer("linears_{}".format(i), linear)
self.add_sublayer("gates_{}".format(i), gate)
@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
t_ = fluid.layers.sigmoid(gate(out))
c = 1 - t_
out = h * t_ + out * c
out = h * t_ + out * c
return out
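
Each highway layer gates between a transformed path and the identity: out = relu(W_h x) * t + x * (1 - t), with t = sigmoid(W_t x). A minimal numpy rendering of one layer (weight shapes are left to the caller):

```python
import numpy as np

def highway_layer(x, w_h, w_t):
    h = np.maximum(0.0, x @ w_h)          # candidate transform
    t = 1.0 / (1.0 + np.exp(-(x @ w_t)))  # transform gate
    return h * t + x * (1.0 - t)          # gated mix of transform and identity
```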

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
@ -7,48 +20,83 @@ from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet
class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4):
super(Decoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'],
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.alpha = self.create_parameter(
shape=(1, ),
attr=param,
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(
value=1.0))
self.pos_inp = get_sinusoid_encoding_table(
1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(
size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(
input_size=config['audio']['num_mels'],
hidden_size=num_hidden * 2,
output_size=num_hidden,
dropout_rate=0.2)
k = math.sqrt(1 / num_hidden)
self.linear = dg.Linear(num_hidden, num_hidden,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.linear = dg.Linear(
num_hidden,
num_hidden,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
self.selfattn_layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
self.attn_layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
self.ffns = [
PositionwiseFeedForward(
num_hidden, num_hidden * num_head, filter_size=1)
for _ in range(3)
]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.stop_linear = dg.Linear(num_hidden, 1,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.mel_linear = dg.Linear(
num_hidden,
config['audio']['num_mels'] * config['audio']['outputs_per_step'],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.stop_linear = dg.Linear(
num_hidden,
1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn = True)
self.postconvnet = PostConvNet(
config['audio']['num_mels'],
config['hidden_size'],
filter_size=5,
padding=4,
num_conv=5,
outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn=True)
def forward(self, key, value, query, c_mask, positional):
@ -56,15 +104,20 @@ class Decoder(dg.Layer):
if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
query)
triu_tensor = dg.to_variable(
get_triu_tensor(query.numpy(), query.numpy())).astype(
np.float32)
mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
zero_mask = get_attn_key_pad_mask(
layers.squeeze(c_mask, [-1]), query)
else:
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = get_triu_tensor(query.numpy(),
query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
@ -85,9 +138,12 @@ class Decoder(dg.Layer):
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
self.ffns):
query, attn_dec = selfattn(
query, query, query, mask=mask, query_mask=m_mask)
query, attn_dot = attn(
key, value, query, mask=zero_mask, query_mask=m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
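
`get_triu_tensor` supplies the causal half of the decoder mask: ones strictly above the diagonal mark the future frames self-attention must not see. In isolation:

```python
import numpy as np

len_q = 4
causal = np.triu(np.ones([len_q, len_q]), 1)  # 1s strictly above the diagonal
print(causal)
# row i is 1 at columns j > i, i.e. positions that must be masked out
```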

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, num_head=4):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=True)
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=1.0))
self.alpha = self.create_parameter(
shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(
1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(
size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(
embedding_size=embedding_size,
num_hidden=num_hidden,
use_cudnn=True)
self.layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
self.ffns = [
PositionwiseFeedForward(
num_hidden,
num_hidden * num_head,
filter_size=1,
use_cudnn=True) for _ in range(3)
]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
@ -35,14 +64,12 @@ class Encoder(dg.Layer):
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
x = positional * self.alpha + x #(N, T, C)
# Positional dropout
x = layers.dropout(x, 0.1)
@ -50,7 +77,7 @@ class Encoder(dg.Layer):
# Self attention encoder
attentions = list()
for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
x = ffn(x)
attentions.append(attention)

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
self.embedding_size = embedding_size
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
padding_idx = None)
self.embedding = dg.Embedding(
size=[len(symbols), embedding_size], padding_idx=None)
self.conv_list = []
k = math.sqrt(1 / embedding_size)
self.conv_list.append(Conv1D(num_channels = embedding_size,
num_filters = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
self.conv_list.append(
Conv1D(
num_channels=embedding_size,
num_filters=num_hidden,
filter_size=5,
padding=int(np.floor(5 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
k = math.sqrt(1 / num_hidden)
for _ in range(2):
self.conv_list.append(Conv1D(num_channels = num_hidden,
num_filters = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
self.conv_list.append(
Conv1D(
num_channels=num_hidden,
num_filters=num_hidden,
filter_size=5,
padding=int(np.floor(5 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
data_layout='NCHW') for _ in range(3)]
self.batch_norm_list = [
dg.BatchNorm(
num_hidden, data_layout='NCHW') for _ in range(3)
]
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
k = math.sqrt(1 / num_hidden)
self.projection = dg.Linear(num_hidden, num_hidden,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.projection = dg.Linear(
num_hidden,
num_hidden,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embedding_size)
x = layers.transpose(x,[0,2,1])
x = self.embedding(x) #(batch_size, seq_len, embedding_size)
x = layers.transpose(x, [0, 2, 1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C)
x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
x = self.projection(x)
return x

View File

@ -1,9 +1,23 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Conv1D
class PostConvNet(dg.Layer):
def __init__(self,
n_mels=80,
@ -22,44 +36,61 @@ class PostConvNet(dg.Layer):
self.batchnorm_last = batchnorm_last
self.conv_list = []
k = math.sqrt(1 / (n_mels * outputs_per_step))
self.conv_list.append(Conv1D(num_channels = n_mels * outputs_per_step,
num_filters = num_hidden,
filter_size = filter_size,
padding = padding,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
self.conv_list.append(
Conv1D(
num_channels=n_mels * outputs_per_step,
num_filters=num_hidden,
filter_size=filter_size,
padding=padding,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
k = math.sqrt(1 / num_hidden)
for _ in range(1, num_conv-1):
self.conv_list.append(Conv1D(num_channels = num_hidden,
num_filters = num_hidden,
filter_size = filter_size,
padding = padding,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
for _ in range(1, num_conv - 1):
self.conv_list.append(
Conv1D(
num_channels=num_hidden,
num_filters=num_hidden,
filter_size=filter_size,
padding=padding,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
self.conv_list.append(Conv1D(num_channels = num_hidden,
num_filters = n_mels * outputs_per_step,
filter_size = filter_size,
padding = padding,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
self.conv_list.append(
Conv1D(
num_channels=num_hidden,
num_filters=n_mels * outputs_per_step,
filter_size=filter_size,
padding=padding,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
data_layout='NCHW') for _ in range(num_conv-1)]
self.batch_norm_list = [
dg.BatchNorm(
num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
]
if self.batchnorm_last:
self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
data_layout='NCHW'))
self.batch_norm_list.append(
dg.BatchNorm(
n_mels * outputs_per_step, data_layout='NCHW'))
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
def forward(self, input):
"""
Post Conv Net.
@ -70,17 +101,18 @@ class PostConvNet(dg.Layer):
output (Variable), Shape(B, T, C), the result after postconvnet.
"""
input = layers.transpose(input, [0,2,1])
input = layers.transpose(input, [0, 2, 1])
len = input.shape[-1]
for i in range(self.num_conv-1):
for i in range(self.num_conv - 1):
batch_norm = self.batch_norm_list[i]
conv = self.conv_list[i]
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
conv = self.conv_list[self.num_conv-1]
input = conv(input)[:,:,:len]
input = layers.dropout(
layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
conv = self.conv_list[self.num_conv - 1]
input = conv(input)[:, :, :len]
if self.batchnorm_last:
batch_norm = self.batch_norm_list[self.num_conv-1]
batch_norm = self.batch_norm_list[self.num_conv - 1]
input = layers.dropout(batch_norm(input), self.dropout)
output = layers.transpose(input, [0,2,1])
output = layers.transpose(input, [0, 2, 1])
return output
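
The `[:, :, :len]` slices make the post-net causal: with filter_size 5 and padding 4 on both sides, each convolution emits T + 4 frames, and keeping only the first T means frame t depends on inputs t-4 .. t. The length arithmetic:

```python
# out_len = T + 2 * padding - filter_size + 1 = T + 2 * 4 - 5 + 1 = T + 4
T, padding, filter_size = 100, 4, 5
assert T + 2 * padding - filter_size + 1 == T + 4
```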

View File

@ -1,8 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class PreNet(dg.Layer):
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
"""
@ -17,13 +31,21 @@ class PreNet(dg.Layer):
self.dropout_rate = dropout_rate
k = math.sqrt(1 / input_size)
self.linear1 = dg.Linear(input_size, hidden_size,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.linear1 = dg.Linear(
input_size,
hidden_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
k = math.sqrt(1 / hidden_size)
self.linear2 = dg.Linear(hidden_size, output_size,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.linear2 = dg.Linear(
hidden_size,
output_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
def forward(self, x):
"""

View File

@ -1,8 +1,22 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformer_tts.decoder import Decoder
class TransformerTTS(dg.Layer):
def __init__(self, config):
super(TransformerTTS, self).__init__()
@ -14,13 +28,7 @@ class TransformerTTS(dg.Layer):
key, c_mask, attns_enc = self.encoder(characters, pos_text)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
key, key, mel_input, c_mask, pos_mel)
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec

View File

@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import librosa
import os, copy
@ -6,14 +19,15 @@ import paddle.fluid.layers as layers
def get_positional_table(d_pos_vec, n_position=1024):
position_enc = np.array([
[pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)]
if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc = np.array(
[[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)]
if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
return position_enc
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
''' Sinusoid position encoding table '''
@ -23,7 +37,8 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table = np.array(
[get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
@ -34,8 +49,10 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
return sinusoid_table
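
`get_sinusoid_encoding_table` interleaves sines on even dims and cosines on odd dims of pos / 10000^(2*(i//2)/d_hid). A self-contained numpy equivalent, vectorized for clarity:

```python
import numpy as np

def sinusoid_table(n_position, d_hid):
    pos = np.arange(n_position)[:, None]                   # (P, 1)
    dim = np.arange(d_hid)[None, :]                        # (1, D)
    angle = pos / np.power(10000, 2 * (dim // 2) / d_hid)
    table = np.zeros((n_position, d_hid))
    table[:, 0::2] = np.sin(angle[:, 0::2])                # dim 2i
    table[:, 1::2] = np.cos(angle[:, 1::2])                # dim 2i+1
    return table
```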
def get_non_pad_mask(seq):
return layers.unsqueeze((seq != 0).astype(np.float32),[-1])
return layers.unsqueeze((seq != 0).astype(np.float32), [-1])
def get_attn_key_pad_mask(seq_k, seq_q):
''' For masking out the padding part of key sequence. '''
@ -43,32 +60,37 @@ def get_attn_key_pad_mask(seq_k, seq_q):
# Expand to fit the shape of key query attention matrix.
len_q = seq_q.shape[1]
padding_mask = (seq_k != 0).astype(np.float32)
padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1])
padding_mask = layers.expand(
layers.unsqueeze(padding_mask, [1]), [1, len_q, 1])
return padding_mask
def get_triu_tensor(seq_k, seq_q):
''' For make a triu tensor '''
len_k = seq_k.shape[1]
len_q = seq_q.shape[1]
batch_size = seq_k.shape[0]
triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0)
triu_tensor = np.repeat(
np.expand_dims(
triu_tensor, axis=0), batch_size, axis=0)
return triu_tensor
def guided_attention(N, T, g=0.2):
'''Guided attention. Refer to page 3 of the paper.'''
W = np.zeros((N, T), dtype=np.float32)
for n_pos in range(W.shape[0]):
for t_pos in range(W.shape[1]):
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N))
**2 / (2 * g * g))
return W
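
The guided-attention weight is W[n, t] = 1 - exp(-(t/T - n/N)^2 / (2 g^2)): zero on the diagonal and approaching one far from it. For example, with the default g = 0.2 a diagonal offset of 0.4 gives roughly 0.865:

```python
import numpy as np

g = 0.2
print(1 - np.exp(-0.4 ** 2 / (2 * g * g)))  # 1 - exp(-2) ~ 0.8647
```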
def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
output = -1 * label * layers.log(input + epsilon) - (
1 - label) * layers.log(1 - input + epsilon)
output = output * (label * (position_weight - 1) + 1)
return layers.reduce_sum(output, dim=[0, 1])
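For reference, a standalone NumPy sketch of the guided-attention mask these utilities build (a re-implementation for illustration, not the module's API): W[n, t] stays near 0 along the text/frame diagonal and approaches 1 away from it, which is what penalizes non-monotonic alignments.

```python
import numpy as np

def guided_attention_demo(N, T, g=0.2):
    # vectorized form of the double loop above:
    # W[n, t] = 1 - exp(-((t/T - n/N)^2) / (2 g^2))
    n = np.arange(N)[:, None] / float(N)
    t = np.arange(T)[None, :] / float(T)
    return (1.0 - np.exp(-(t - n) ** 2 / (2 * g * g))).astype(np.float32)

W = guided_attention_demo(N=5, T=8)
print(W.shape)            # (5, 8)
print(W[0, 0], W[0, -1])  # ~0.0 on the diagonal, close to 1 far off it
```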

View File

@@ -1,27 +1,44 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts.cbhg import CBHG
class Vocoder(dg.Layer):
"""
CBHG Network (mel -> linear)
"""
def __init__(self, config, batch_size):
super(Vocoder, self).__init__()
self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'],
num_filters = config['hidden_size'],
filter_size=1)
self.pre_proj = Conv1D(
num_channels=config['audio']['num_mels'],
num_filters=config['hidden_size'],
filter_size=1)
self.cbhg = CBHG(config['hidden_size'], batch_size)
self.post_proj = Conv1D(num_channels = config['hidden_size'],
num_filters = (config['audio']['n_fft'] // 2) + 1,
filter_size=1)
self.post_proj = Conv1D(
num_channels=config['hidden_size'],
num_filters=(config['audio']['n_fft'] // 2) + 1,
filter_size=1)
def forward(self, mel):
mel = layers.transpose(mel, [0,2,1])
mel = layers.transpose(mel, [0, 2, 1])
mel = self.pre_proj(mel)
mel = self.cbhg(mel)
mag_pred = self.post_proj(mel)
mag_pred = layers.transpose(mag_pred, [0,2,1])
mag_pred = layers.transpose(mag_pred, [0, 2, 1])
return mag_pred
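As a shape sanity check for this mel-to-linear path, here is a NumPy sketch that treats both 1x1 convolutions as per-frame matrix multiplies (the CBHG stage is elided, and all sizes are illustrative assumptions):

```python
import numpy as np

batch, frames, n_mels, hidden, n_fft = 2, 100, 80, 256, 1024
mel = np.random.rand(batch, frames, n_mels).astype(np.float32)

x = mel.transpose(0, 2, 1)                        # [B, T, n_mels] -> [B, n_mels, T]
w_pre = np.random.randn(hidden, n_mels)           # 1x1 conv: n_mels -> hidden
x = np.einsum('oc,bct->bot', w_pre, x)
# ... the CBHG stack would run here at `hidden` channels ...
w_post = np.random.randn(n_fft // 2 + 1, hidden)  # 1x1 conv: hidden -> linear bins
mag = np.einsum('oc,bct->bot', w_post, x).transpose(0, 2, 1)
print(mag.shape)                                  # (2, 100, 513)
```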

View File

@@ -1 +1,15 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.waveflow.waveflow import WaveFlow

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import librosa

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
import paddle.fluid.dygraph as dg

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import librosa
@@ -18,8 +32,8 @@ class Dataset(ljspeech.LJSpeech):
self.fft_window_shift = config.fft_window_shift
# Calculate context frames.
frames_per_second = config.sample_rate // self.fft_window_shift
train_clip_frames = int(np.ceil(
config.train_clip_second * frames_per_second))
train_clip_frames = int(
np.ceil(config.train_clip_second * frames_per_second))
context_frames = config.context_size // self.fft_window_shift
self.num_frames = train_clip_frames + context_frames
@@ -53,12 +67,16 @@ class Dataset(ljspeech.LJSpeech):
# Compute mel-spectrogram.
# Turn center to False to prevent internal padding.
spectrogram = librosa.core.stft(
audio, hop_length=fft_window_shift,
win_length=fft_window_size, n_fft=fft_size, center=False)
audio,
hop_length=fft_window_shift,
win_length=fft_window_size,
n_fft=fft_size,
center=False)
spectrogram_magnitude = np.abs(spectrogram)
# Compute mel-spectrograms.
mel_filter_bank = librosa.filters.mel(sr=sr, n_fft=fft_size,
mel_filter_bank = librosa.filters.mel(sr=sr,
n_fft=fft_size,
n_mels=config.mel_bands)
mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)
mel_spectrogram = mel_spectrogram.T
@@ -70,7 +88,7 @@ class Dataset(ljspeech.LJSpeech):
mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)
# Extract the center of audio that corresponds to mel spectrograms.
audio = audio[fft_padding : -fft_padding]
audio = audio[fft_padding:-fft_padding]
assert mel_spectrogram.shape[0] * fft_window_shift == audio.size
return audio, mel_spectrogram
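A minimal, self-contained sketch of this mel pipeline on synthetic audio (the parameter values and the log floor are illustrative assumptions, not the repo's config):

```python
import numpy as np
import librosa

sr, n_fft, win, hop, n_mels = 22050, 2048, 1024, 256, 80  # illustrative values
audio = np.random.uniform(-1, 1, sr).astype(np.float32)   # one second of noise

spec = librosa.stft(audio, n_fft=n_fft, win_length=win,
                    hop_length=hop, center=False)          # no internal padding
magnitude = np.abs(spec)
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
mel = np.dot(mel_basis, magnitude).T                       # [frames, n_mels]
mel = 20 * np.log10(np.maximum(mel, 1e-5))                 # to dB (floor assumed)
mel = np.clip((mel + 100) / 100, 0, 1)                     # same 0-1 normalization
print(mel.shape)
```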
@@ -101,7 +119,7 @@ class Subset(dataset.Dataset):
audio_start = frame_start * fft_window_shift
audio_end = frame_end * fft_window_shift
audio = audio[audio_start : audio_end]
audio = audio[audio_start:audio_end]
return audio, mel, audio_start
@@ -141,14 +159,14 @@ class LJSpeech:
sampler = DistributedSampler(len(trainset), nranks, rank)
total_bs = config.batch_size
assert total_bs % nranks == 0
train_sampler = BatchSampler(sampler, total_bs // nranks,
drop_last=True)
train_sampler = BatchSampler(
sampler, total_bs // nranks, drop_last=True)
trainloader = DataCargo(trainset, batch_sampler=train_sampler)
trainreader = fluid.io.PyReader(capacity=50, return_list=True)
trainreader.decorate_batch_generator(trainloader, place)
self.trainloader = (data for _ in iter(int, 1)
for data in trainreader())
for data in trainreader())
# Valid dataset.
validset = Subset(ds, valid_indices, valid=True)
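The `iter(int, 1)` expression above is the standard trick for an infinite loop inside a generator expression: `iter(callable, sentinel)` keeps calling `int()` (which returns 0 and never the sentinel 1), so the finite reader is restarted forever. A toy demonstration:

```python
import itertools

def one_epoch():                 # stands in for trainreader(): one finite pass
    return iter([1, 2, 3])

stream = (x for _ in iter(int, 1) for x in one_epoch())
print(list(itertools.islice(stream, 8)))   # [1, 2, 3, 1, 2, 3, 1, 2]
```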

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility module for restarting training when using SLURM.
"""
@@ -45,8 +58,8 @@ def parse_time(text):
try:
return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds)
except ValueError as e:
raise ValueError("Error parsing time {}. Got error {}.".format(
text, str(e)))
raise ValueError("Error parsing time {}. Got error {}.".format(text,
str(e)))
def restart_command():
@@ -76,8 +89,10 @@ def restart_command():
gres, partition = info.get("Gres"), info.get("Partition")
stderr, stdout = info.get("StdErr"), info.get("StdOut")
job_name = info.get("JobName")
command = ["sbatch", "--job-name={}".format(job_name),
"--ntasks={}".format(num_tasks)]
command = [
"sbatch", "--job-name={}".format(job_name),
"--ntasks={}".format(num_tasks)
]
if partition:
command.extend(["--partition", partition])
@@ -98,12 +113,13 @@ def restart_command():
dist_setting = ['-m', 'paddle.distributed.launch']
wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv
command.append(
"--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd)))
command.append("--wrap={}".format(" ".join(
shlex.quote(arg) for arg in wrap_cmd)))
time_limit_string = info["TimeLimit"]
if time_limit_string.lower() == "unlimited":
print("UNLIMITED detected: restart OFF, infinite learning ON.",
flush=True)
print(
"UNLIMITED detected: restart OFF, infinite learning ON.",
flush=True)
return command, None
time_limit = parse_time(time_limit_string)
runtime = parse_time(info["RunTime"])
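A standalone sketch of parsing SLURM-style time strings such as `12:30:00` or `1-02:00:00` (day-hour form); this mirrors the intent of `parse_time`/`parse_hours` above rather than their exact code:

```python
def parse_slurm_time(text):
    hours, minutes, seconds = text.split(":")
    if "-" in hours:                          # "D-HH" form
        days, hours = hours.split("-")
        total_hours = int(days) * 24 + int(hours)
    else:
        total_hours = int(hours)
    return total_hours * 3600 + int(minutes) * 60 + int(seconds)

assert parse_slurm_time("00:01:30") == 90
assert parse_slurm_time("1-00:00:10") == 86410
```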

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
@@ -12,25 +26,42 @@ from wavenet import WaveNet
def add_options_to_parser(parser):
parser.add_argument('--model', type=str, default='wavenet',
parser.add_argument(
'--model',
type=str,
default='wavenet',
help="general name of the model")
parser.add_argument('--name', type=str,
help="specific name of the training model")
parser.add_argument('--root', type=str,
help="root path of the LJSpeech dataset")
parser.add_argument(
'--name', type=str, help="specific name of the training model")
parser.add_argument(
'--root', type=str, help="root path of the LJSpeech dataset")
parser.add_argument('--use_gpu', type=bool, default=True,
parser.add_argument(
'--use_gpu',
type=bool,
default=True,
help="option to use gpu training")
parser.add_argument('--iteration', type=int, default=None,
parser.add_argument(
'--iteration',
type=int,
default=None,
help=("which iteration of checkpoint to load, "
"default to load the latest checkpoint"))
parser.add_argument('--checkpoint', type=str, default=None,
parser.add_argument(
'--checkpoint',
type=str,
default=None,
help="path of the checkpoint to load")
parser.add_argument('--output', type=str, default="./syn_audios",
parser.add_argument(
'--output',
type=str,
default="./syn_audios",
help="path to write synthesized audio files")
parser.add_argument('--sample', type=int,
parser.add_argument(
'--sample',
type=int,
help="which of the valid samples to synthesize audio")

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
@@ -18,24 +32,42 @@ MAXIMUM_SAVE_TIME = 10 * 60
def add_options_to_parser(parser):
parser.add_argument('--model', type=str, default='wavenet',
parser.add_argument(
'--model',
type=str,
default='wavenet',
help="general name of the model")
parser.add_argument('--name', type=str,
help="specific name of the training model")
parser.add_argument('--root', type=str,
help="root path of the LJSpeech dataset")
parser.add_argument(
'--name', type=str, help="specific name of the training model")
parser.add_argument(
'--root', type=str, help="root path of the LJSpeech dataset")
parser.add_argument('--parallel', type=bool, default=True,
parser.add_argument(
'--parallel',
type=bool,
default=True,
help="option to use data parallel training")
parser.add_argument('--use_gpu', type=bool, default=True,
parser.add_argument(
'--use_gpu',
type=bool,
default=True,
help="option to use gpu training")
parser.add_argument('--iteration', type=int, default=None,
parser.add_argument(
'--iteration',
type=int,
default=None,
help=("which iteration of checkpoint to load, "
"default to load the latest checkpoint"))
parser.add_argument('--checkpoint', type=str, default=None,
parser.add_argument(
'--checkpoint',
type=str,
default=None,
help="path of the checkpoint to load")
parser.add_argument('--slurm', type=bool, default=False,
parser.add_argument(
'--slurm',
type=bool,
default=False,
help="whether you are using slurm to submit training jobs")
@@ -104,8 +136,8 @@ def train(config):
# Check whether reaching the time limit.
if config.slurm:
done = (death_time is not None and death_time - time.time() <
MAXIMUM_SAVE_TIME)
done = (death_time is not None and
death_time - time.time() < MAXIMUM_SAVE_TIME)
if rank == 0 and done:
print("Saving progress before exiting.")
@@ -127,8 +159,8 @@ def train(config):
if __name__ == "__main__":
# Create parser.
parser = jsonargparse.ArgumentParser(description="Train WaveNet model",
formatter_class='default_argparse')
parser = jsonargparse.ArgumentParser(
description="Train WaveNet model", formatter_class='default_argparse')
add_options_to_parser(parser)
utils.add_config_options_to_parser(parser)
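The guard reshaped in the `--slurm` hunk reduces to one rule: stop once the remaining wall-clock budget is smaller than the time reserved for a final checkpoint save. A minimal sketch:

```python
import time

MAXIMUM_SAVE_TIME = 10 * 60          # seconds reserved for one last save

def should_stop(death_time):
    # death_time: absolute epoch seconds at which SLURM will kill the job
    return death_time is not None and death_time - time.time() < MAXIMUM_SAVE_TIME

print(should_stop(None))                  # False: no known limit
print(should_stop(time.time() + 5 * 60))  # True: inside the save window
```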

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
@@ -8,57 +22,82 @@ import paddle.fluid.dygraph as dg
def add_config_options_to_parser(parser):
parser.add_argument('--valid_size', type=int,
help="size of the valid dataset")
parser.add_argument('--train_clip_second', type=float,
parser.add_argument(
'--valid_size', type=int, help="size of the valid dataset")
parser.add_argument(
'--train_clip_second',
type=float,
help="the length of audio clip for training")
parser.add_argument('--sample_rate', type=int,
help="sampling rate of audio data file")
parser.add_argument('--fft_window_shift', type=int,
parser.add_argument(
'--sample_rate', type=int, help="sampling rate of audio data file")
parser.add_argument(
'--fft_window_shift',
type=int,
help="the shift of fft window for each frame")
parser.add_argument('--fft_window_size', type=int,
parser.add_argument(
'--fft_window_size',
type=int,
help="the size of fft window for each frame")
parser.add_argument('--fft_size', type=int,
help="the size of fft filter on each frame")
parser.add_argument('--mel_bands', type=int,
parser.add_argument(
'--fft_size', type=int, help="the size of fft filter on each frame")
parser.add_argument(
'--mel_bands',
type=int,
help="the number of mel bands when calculating mel spectrograms")
parser.add_argument('--seed', type=int,
help="seed of random initialization for the model")
parser.add_argument('--batch_size', type=int,
help="batch size for training")
parser.add_argument('--test_every', type=int,
help="test interval during training")
parser.add_argument('--save_every', type=int,
parser.add_argument(
'--seed', type=int, help="seed of random initialization for the model")
parser.add_argument(
'--batch_size', type=int, help="batch size for training")
parser.add_argument(
'--test_every', type=int, help="test interval during training")
parser.add_argument(
'--save_every',
type=int,
help="checkpointing interval during training")
parser.add_argument('--max_iterations', type=int,
help="maximum training iterations")
parser.add_argument(
'--max_iterations', type=int, help="maximum training iterations")
parser.add_argument('--layers', type=int,
help="number of dilated convolution layers")
parser.add_argument('--kernel_width', type=int,
help="dilated convolution kernel width")
parser.add_argument('--dilation_block', type=list,
help="dilated convolution kernel width")
parser.add_argument(
'--layers', type=int, help="number of dilated convolution layers")
parser.add_argument(
'--kernel_width', type=int, help="dilated convolution kernel width")
parser.add_argument(
'--dilation_block', type=list, help="dilated convolution kernel width")
parser.add_argument('--residual_channels', type=int)
parser.add_argument('--skip_channels', type=int)
parser.add_argument('--loss_type', type=str,
help="mix-gaussian-pdf or softmax")
parser.add_argument('--num_channels', type=int, default=None,
parser.add_argument(
'--loss_type', type=str, help="mix-gaussian-pdf or softmax")
parser.add_argument(
'--num_channels',
type=int,
default=None,
help="number of channels for softmax output")
parser.add_argument('--num_mixtures', type=int, default=None,
parser.add_argument(
'--num_mixtures',
type=int,
default=None,
help="number of gaussian mixtures for gaussian output")
parser.add_argument('--log_scale_min', type=float, default=None,
parser.add_argument(
'--log_scale_min',
type=float,
default=None,
help="minimum clip value of log variance of gaussian output")
parser.add_argument('--conditioner.filter_sizes', type=list,
parser.add_argument(
'--conditioner.filter_sizes',
type=list,
help="conv2d tranpose op filter sizes for building conditioner")
parser.add_argument('--conditioner.upsample_factors', type=list,
parser.add_argument(
'--conditioner.upsample_factors',
type=list,
help="list of upsample factors for building conditioner")
parser.add_argument('--learning_rate', type=float)
parser.add_argument('--gradient_max_norm', type=float)
parser.add_argument('--anneal.every', type=int,
parser.add_argument(
'--anneal.every',
type=int,
help="step interval for annealing learning rate")
parser.add_argument('--anneal.rate', type=float)
@@ -113,8 +152,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
handle.write("model_checkpoint_path: step-{}".format(iteration))
def load_parameters(checkpoint_dir, rank, model, optimizer=None,
iteration=None, file_path=None):
def load_parameters(checkpoint_dir,
rank,
model,
optimizer=None,
iteration=None,
file_path=None):
if file_path is None:
if iteration is None:
iteration = load_latest_checkpoint(checkpoint_dir, rank)
@@ -128,7 +171,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None,
if optimizer and optimizer_dict:
optimizer.set_dict(optimizer_dict)
print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
rank, file_path))
rank, file_path))
def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
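The checkpoint helpers above revolve around a one-line pointer file recording the latest iteration. A sketch of both directions (the pointer filename here is an assumption; the diff only shows the line being written):

```python
import os, tempfile

def write_pointer(checkpoint_dir, iteration):
    # same line format as save_latest_checkpoint above; filename is assumed
    with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as f:
        f.write("model_checkpoint_path: step-{}".format(iteration))

def read_pointer(checkpoint_dir):
    with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
        return int(f.read().strip().split("step-")[-1])

d = tempfile.mkdtemp()
write_pointer(d, 5000)
print(read_pointer(d))   # 5000
```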

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
@@ -13,8 +27,13 @@ from wavenet_modules import WaveNetModule
class WaveNet():
def __init__(self, config, checkpoint_dir, parallel=False, rank=0,
nranks=1, tb_logger=None):
def __init__(self,
config,
checkpoint_dir,
parallel=False,
rank=0,
nranks=1,
tb_logger=None):
# Process config to calculate the context size
dilations = list(
itertools.islice(
@@ -45,9 +64,9 @@ class WaveNet():
if training:
# Create Learning rate scheduler.
lr_scheduler = dg.ExponentialDecay(
learning_rate = config.learning_rate,
decay_steps = config.anneal.every,
decay_rate = config.anneal.rate,
learning_rate=config.learning_rate,
decay_steps=config.anneal.every,
decay_rate=config.anneal.rate,
staircase=True)
optimizer = fluid.optimizer.AdamOptimizer(
@@ -57,10 +76,13 @@ class WaveNet():
config.gradient_max_norm)
# Load parameters.
utils.load_parameters(self.checkpoint_dir, self.rank,
wavenet, optimizer,
iteration=config.iteration,
file_path=config.checkpoint)
utils.load_parameters(
self.checkpoint_dir,
self.rank,
wavenet,
optimizer,
iteration=config.iteration,
file_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank))
# Data parallelism.
@@ -74,9 +96,12 @@ class WaveNet():
else:
# Load parameters.
utils.load_parameters(self.checkpoint_dir, self.rank, wavenet,
iteration=config.iteration,
file_path=config.checkpoint)
utils.load_parameters(
self.checkpoint_dir,
self.rank,
wavenet,
iteration=config.iteration,
file_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank))
self.wavenet = wavenet
@@ -104,7 +129,9 @@ class WaveNet():
else:
current_lr = self.optimizer._learning_rate
self.optimizer.minimize(loss, grad_clip=self.clipper,
self.optimizer.minimize(
loss,
grad_clip=self.clipper,
parameter_list=self.wavenet.parameters())
self.wavenet.clear_gradients()
@@ -143,10 +170,16 @@ class WaveNet():
tb = self.tb_logger
tb.add_scalar("Valid-Avg-Loss", loss_val, iteration)
tb.add_audio("Teacher-Forced-Audio-0", sample_audios[0].numpy(),
iteration, sample_rate=self.config.sample_rate)
tb.add_audio("Teacher-Forced-Audio-1", sample_audios[1].numpy(),
iteration, sample_rate=self.config.sample_rate)
tb.add_audio(
"Teacher-Forced-Audio-0",
sample_audios[0].numpy(),
iteration,
sample_rate=self.config.sample_rate)
tb.add_audio(
"Teacher-Forced-Audio-1",
sample_audios[1].numpy(),
iteration,
sample_rate=self.config.sample_rate)
@dg.no_grad
def infer(self, iteration):
@@ -165,10 +198,9 @@ class WaveNet():
start_time = time.time()
syn_audio = self.wavenet.synthesize(mels_list[sample])
syn_time = time.time() - start_time
print("audio shape {}, synthesis time {}".format(
syn_audio.shape, syn_time))
librosa.output.write_wav(filename, syn_audio,
sr=config.sample_rate)
print("audio shape {}, synthesis time {}".format(syn_audio.shape,
syn_time))
librosa.output.write_wav(filename, syn_audio, sr=config.sample_rate)
def save(self, iteration):
utils.save_latest_parameters(self.checkpoint_dir, iteration,

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
@@ -20,7 +34,7 @@ def extract_slices(x, audio_starts, audio_length, rank):
start = audio_starts.numpy()[i]
end = start + audio_length
slice = fluid.layers.slice(
x, axes=[0, 1], starts=[i, start], ends=[i+1, end])
x, axes=[0, 1], starts=[i, start], ends=[i + 1, end])
slices.append(fluid.layers.squeeze(slice, [0]))
x = fluid.layers.stack(slices, axis=0)
@@ -82,15 +96,13 @@ class WaveNetModule(dg.Layer):
embed_dim=config.residual_channels,
std=0.1)
elif config.loss_type == "mix-gaussian-pdf":
self.embedding_fc = modules.FC(
self.full_name(),
in_features=1,
size=config.residual_channels,
num_flatten_dims=2,
relu=False)
self.embedding_fc = modules.FC(self.full_name(),
in_features=1,
size=config.residual_channels,
num_flatten_dims=2,
relu=False)
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(loss_type))
self.dilated_causal_convs = []
for dilation in self.dilations:
@@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer):
num_filters=config.residual_channels,
filter_size=config.kernel_width,
dilation=dilation,
causal=True
)
)
causal=True))
for i, layer in enumerate(self.dilated_causal_convs):
self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
self.fc1 = modules.FC(
self.full_name(),
in_features=config.residual_channels,
size=config.skip_channels,
num_flatten_dims=2,
relu=True,
act="relu")
self.fc1 = modules.FC(self.full_name(),
in_features=config.residual_channels,
size=config.skip_channels,
num_flatten_dims=2,
relu=True,
act="relu")
self.fc2 = modules.FC(
self.full_name(),
in_features=config.skip_channels,
size=config.skip_channels,
num_flatten_dims=2,
relu=True,
act="relu")
self.fc2 = modules.FC(self.full_name(),
in_features=config.skip_channels,
size=config.skip_channels,
num_flatten_dims=2,
relu=True,
act="relu")
if config.loss_type == "softmax":
self.fc3 = modules.FC(
self.full_name(),
in_features=config.skip_channels,
size=config.num_channels,
num_flatten_dims=2,
relu=False)
self.fc3 = modules.FC(self.full_name(),
in_features=config.skip_channels,
size=config.num_channels,
num_flatten_dims=2,
relu=False)
elif config.loss_type == "mix-gaussian-pdf":
self.fc3 = modules.FC(
self.full_name(),
in_features=config.skip_channels,
size=3 * config.num_mixtures,
num_flatten_dims=2,
relu=False)
self.fc3 = modules.FC(self.full_name(),
in_features=config.skip_channels,
size=3 * config.num_mixtures,
num_flatten_dims=2,
relu=False)
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(loss_type))
def sample_softmax(self, mix_parameters):
batch, length, hidden = mix_parameters.shape
mix_param_2d = fluid.layers.reshape(mix_parameters,
[batch * length, hidden])
[batch * length, hidden])
mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)
# quantized: [batch * length]
quantized = fluid.layers.cast(fluid.layers.sampling_id(mix_param_2d),
dtype="float32")
quantized = fluid.layers.cast(
fluid.layers.sampling_id(mix_param_2d), dtype="float32")
samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0
# samples: [batch * length]
@@ -162,13 +167,13 @@ class WaveNetModule(dg.Layer):
# to [bs * len, 3 * num_mixtures].
batch, length, hidden = mix_parameters.shape
mix_param_2d = fluid.layers.reshape(mix_parameters,
[batch * length, hidden])
[batch * length, hidden])
K = hidden // 3
# Unpack the parameters of the mixture of gaussian.
logits_pi = mix_param_2d[:, 0 : K]
mu = mix_param_2d[:, K : 2*K]
log_s = mix_param_2d[:, 2*K : 3*K]
logits_pi = mix_param_2d[:, 0:K]
mu = mix_param_2d[:, K:2 * K]
log_s = mix_param_2d[:, 2 * K:3 * K]
s = fluid.layers.exp(log_s)
pi = fluid.layers.softmax(logits_pi, axis=-1)
@@ -220,8 +225,9 @@ class WaveNetModule(dg.Layer):
# Calculate gaussian loss.
targets = fluid.layers.unsqueeze(targets, -1)
targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
x_std = inv_s * (targets - mu)
targets = fluid.layers.expand(targets,
[1, 1, self.config.num_mixtures])
x_std = inv_s * (targets - mu)
exponent = fluid.layers.exp(-0.5 * x_std * x_std)
pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
pdf_x = pi * pdf_x
@@ -239,8 +245,8 @@ class WaveNetModule(dg.Layer):
# Slice conditioners.
audio_length = audios.shape[1]
conditioner = extract_slices(full_conditioner,
audio_starts, audio_length, self.rank)
conditioner = extract_slices(full_conditioner, audio_starts,
audio_length, self.rank)
# input_audio, target_audio: [bs, len]
input_audios = audios[:, :-1]
@@ -263,15 +269,16 @@ class WaveNetModule(dg.Layer):
layer_input = self.embedding_fc(
fluid.layers.unsqueeze(input_audios, 2))
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(loss_type))
# layer_input: [bs, res_channel, 1, len]
layer_input = fluid.layers.unsqueeze(
fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
fluid.layers.transpose(
layer_input, perm=[0, 2, 1]), 2)
# conditioner: [bs, mel_bands, 1, len]
conditioner = fluid.layers.unsqueeze(
fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
fluid.layers.transpose(
conditioner, perm=[0, 2, 1]), 2)
skip = None
for i, layer in enumerate(self.dilated_causal_convs):
@@ -292,17 +299,16 @@ class WaveNetModule(dg.Layer):
elif loss_type == "mix-gaussian-pdf":
sample_audios = self.sample_mix_gaussian(mix_parameters)
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(
loss_type))
if loss_type == "softmax":
loss = self.softmax_loss(target_audios, mix_parameters)
elif loss_type == "mix-gaussian-pdf":
loss = self.mixture_density_loss(target_audios,
mix_parameters, self.log_scale_min)
loss = self.mixture_density_loss(target_audios, mix_parameters,
self.log_scale_min)
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(loss_type))
return loss, sample_audios
@@ -335,22 +341,23 @@ class WaveNetModule(dg.Layer):
elif loss_type == "mix-gaussian-pdf":
audio_input = self.embedding_fc(current_sample)
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(
loss_type))
# [bs, channel, 1, 1]
audio_input = fluid.layers.unsqueeze(
fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
fluid.layers.transpose(
audio_input, perm=[0, 2, 1]), 2)
# [bs, mel_bands]
cond_input = conditioner[:, i, :]
# [bs, mel_bands, 1, 1]
cond_input = fluid.layers.reshape(
cond_input, cond_input.shape + [1, 1])
cond_input = fluid.layers.reshape(cond_input,
cond_input.shape + [1, 1])
skip = None
for layer in self.dilated_causal_convs:
audio_input, skip = layer.add_input(
audio_input, skip, cond_input)
audio_input, skip = layer.add_input(audio_input, skip,
cond_input)
# [bs, 1, channel]
skip = fluid.layers.transpose(
@ -361,14 +368,14 @@ class WaveNetModule(dg.Layer):
elif loss_type == "mix-gaussian-pdf":
sample = self.sample_mix_gaussian(mix_parameters)
else:
raise ValueError(
"loss_type {} is unsupported!".format(loss_type))
raise ValueError("loss_type {} is unsupported!".format(
loss_type))
audio_samples.append(sample)
# [bs]
current_sample = audio_samples[-1]
# [bs, 1, 1]
current_sample = fluid.layers.reshape(current_sample,
current_sample.shape + [1, 1])
current_sample = fluid.layers.reshape(
current_sample, current_sample.shape + [1, 1])
# syn_audio: [num_samples]
syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
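A NumPy sketch of the `sample_mix_gaussian` logic above, assuming the same packing of the last axis as `[logits_pi | mu | log_s]` with K mixtures each:

```python
import numpy as np

def sample_mix_gaussian(mix_parameters, rng=np.random):
    K = mix_parameters.shape[-1] // 3
    logits_pi = mix_parameters[:, :K]
    mu = mix_parameters[:, K:2 * K]
    log_s = mix_parameters[:, 2 * K:3 * K]
    # softmax over mixture weights
    pi = np.exp(logits_pi - logits_pi.max(-1, keepdims=True))
    pi /= pi.sum(-1, keepdims=True)
    # pick one component per row, then draw from that gaussian
    comp = np.array([rng.choice(K, p=row) for row in pi])
    rows = np.arange(len(comp))
    return rng.normal(mu[rows, comp], np.exp(log_s[rows, comp]))

params = np.random.randn(4, 3 * 10)        # batch of 4, K = 10
print(sample_mix_gaussian(params).shape)   # (4,)
```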

View File

@@ -1,2 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import weight_norm
from .customized import *

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
@@ -7,6 +21,7 @@ class Pool1D(dg.Layer):
"""
A Pool 1D block implemented with Pool2D.
"""
def __init__(self,
pool_size=-1,
pool_type='max',
@@ -28,12 +43,15 @@
self.exclusive = exclusive
self.data_format = data_format
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
global_pooling = global_pooling, use_cudnn = use_cudnn,
ceil_mode = ceil_mode, exclusive = exclusive)
self.pool2d = dg.Pool2D(
[1, pool_size],
pool_type=pool_type,
pool_stride=[1, pool_stride],
pool_padding=[0, pool_padding],
global_pooling=global_pooling,
use_cudnn=use_cudnn,
ceil_mode=ceil_mode,
exclusive=exclusive)
def forward(self, x):
"""
@@ -53,12 +71,14 @@
x = fluid.layers.transpose(x, [0, 2, 1])
return x
class Conv1D(dg.Conv2D):
"""A standard Conv1D layer that use (B, C, T) data layout. It inherit Conv2D and
use (B, C, 1, T) data layout to compute 1D convolution. Nothing more.
NOTE: we inherit Conv2D instead of encapsulate a Conv2D layer to make it a simple
layer, instead of a complex one. So we can easily apply weight norm to it.
"""
def __init__(self,
num_channels,
num_filters,
@@ -72,17 +92,18 @@ class Conv1D(dg.Conv2D):
use_cudnn=True,
act=None,
dtype='float32'):
super(Conv1D, self).__init__(num_channels,
num_filters, (1, filter_size),
stride=(1, stride),
padding=(0, padding),
dilation=(1, dilation),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
super(Conv1D, self).__init__(
num_channels,
num_filters, (1, filter_size),
stride=(1, stride),
padding=(0, padding),
dilation=(1, dilation),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
x = F.unsqueeze(x, [2])
@@ -105,18 +126,19 @@ class Conv1DTranspose(dg.Conv2DTranspose):
use_cudnn=True,
act=None,
dtype='float32'):
super(Conv1DTranspose, self).__init__(num_channels,
num_filters, (1, filter_size),
output_size=None,
padding=(0, padding),
stride=(1, stride),
dilation=(1, dilation),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
super(Conv1DTranspose, self).__init__(
num_channels,
num_filters, (1, filter_size),
output_size=None,
padding=(0, padding),
stride=(1, stride),
dilation=(1, dilation),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
x = F.unsqueeze(x, [2])
@@ -134,6 +156,7 @@ class Conv1DCell(Conv1D):
It is a cell that acts like an RNN cell. It does not support stride > 1, and it
ensures a 1-to-1 mapping from input time steps to output time steps.
"""
def __init__(self,
num_channels,
num_filters,
@@ -150,18 +173,19 @@ class Conv1DCell(Conv1D):
padding = receptive_field - 1 if causal else receptive_field // 2
self._receptive_field = receptive_field
self.causal = causal
super(Conv1DCell, self).__init__(num_channels,
num_filters,
filter_size,
stride=1,
padding=padding,
dilation=dilation,
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
super(Conv1DCell, self).__init__(
num_channels,
num_filters,
filter_size,
stride=1,
padding=padding,
dilation=dilation,
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
# it ensures that output time steps == input time steps
@@ -189,15 +213,16 @@ class Conv1DCell(Conv1D):
def add_input(self, x_t):
batch_size, c_in, _ = x_t.shape
if self._buffer is None:
self._buffer = F.zeros((batch_size, c_in, self.receptive_field),
dtype=x_t.dtype)
self._buffer = F.zeros(
(batch_size, c_in, self.receptive_field), dtype=x_t.dtype)
self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1)
if self._dilation[1] > 1:
input = F.strided_slice(self._buffer,
axes=[2],
starts=[0],
ends=[self.receptive_field],
strides=[self._dilation[1]])
input = F.strided_slice(
self._buffer,
axes=[2],
starts=[0],
ends=[self.receptive_field],
strides=[self._dilation[1]])
else:
input = self._buffer
input = F.reshape(input, (batch_size, -1))
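The buffer logic in `Conv1DCell.add_input` is easiest to see in isolation: keep the last `receptive_field` samples, shift one step per call, and for dilation > 1 read every `dilation`-th entry, which is what the `strided_slice` above does. A NumPy sketch:

```python
import numpy as np

dilation, filter_size = 4, 2
receptive_field = 1 + dilation * (filter_size - 1)   # = 5

buf = np.zeros(receptive_field)
for t, x_t in enumerate([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
    buf = np.concatenate([buf[1:], [x_t]])   # shift in the newest sample
    taps = buf[::dilation]                   # dilated read: 2 taps, 4 apart
    print(t, taps)
```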

View File

@@ -1,6 +1,20 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class DynamicGRU(dg.Layer):
def __init__(self,
size,
@@ -49,4 +63,3 @@ class DynamicGRU(dg.Layer):
res = res[::-1]
res = layers.concat(res, axis=1)
return res

View File

@@ -1,3 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
@@ -7,28 +20,41 @@ from parakeet.modules.customized import Conv1D
class PositionwiseFeedForward(dg.Layer):
''' A two-feed-forward-layer module '''
def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
def __init__(self,
d_in,
num_hidden,
filter_size,
padding=0,
use_cudnn=True,
dropout=0.1):
super(PositionwiseFeedForward, self).__init__()
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.dropout = dropout
k = math.sqrt(1 / d_in)
self.w_1 = Conv1D(num_channels = d_in,
num_filters = num_hidden,
filter_size = filter_size,
padding=padding,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn)
self.w_1 = Conv1D(
num_channels=d_in,
num_filters=num_hidden,
filter_size=filter_size,
padding=padding,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn)
k = math.sqrt(1 / num_hidden)
self.w_2 = Conv1D(num_channels = num_hidden,
num_filters = d_in,
filter_size = filter_size,
padding=padding,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn)
self.w_2 = Conv1D(
num_channels=num_hidden,
num_filters=d_in,
filter_size=filter_size,
padding=padding,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn)
self.layer_norm = dg.LayerNorm(d_in)
def forward(self, input):
@@ -40,14 +66,14 @@ class PositionwiseFeedForward(dg.Layer):
Returns:
output (Variable), Shape(B, T, C), the result after FFN.
"""
x = layers.transpose(input, [0,2,1])
x = layers.transpose(input, [0, 2, 1])
# FFN network
x = self.w_2(layers.relu(self.w_1(x)))
# dropout
x = layers.dropout(x, self.dropout)
x = layers.transpose(x, [0,2,1])
x = layers.transpose(x, [0, 2, 1])
# residual connection
x = x + input
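Shape-wise, the block computes `w_2(relu(w_1(x))) + x`, and the convolutions act as per-step linear maps when `filter_size` is 1. A NumPy sketch with dropout omitted (sizes are illustrative):

```python
import numpy as np

B, T, C, H = 2, 7, 64, 256                  # batch, steps, d_in, num_hidden
x = np.random.randn(B, T, C)
w1 = np.random.randn(H, C)                  # d_in -> num_hidden
w2 = np.random.randn(C, H)                  # num_hidden -> d_in

h = np.maximum(np.einsum('hc,btc->bth', w1, x), 0.0)   # 1x1 conv + relu
y = np.einsum('ch,bth->btc', w2, h) + x                # 1x1 conv + residual
print(y.shape)                              # (2, 7, 64)
```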

View File

@@ -1,29 +1,53 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class Linear(dg.Layer):
def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
def __init__(self,
in_features,
out_features,
is_bias=True,
dtype="float32"):
super(Linear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.dtype = dtype
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.bias = is_bias
self.weight = fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer())
self.bias = is_bias
if is_bias is not False:
k = math.sqrt(1 / in_features)
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k))
self.linear = dg.Linear(in_features, out_features, param_attr = self.weight,
bias_attr = self.bias,)
self.linear = dg.Linear(
in_features,
out_features,
param_attr=self.weight,
bias_attr=self.bias, )
def forward(self, x):
x = self.linear(x)
return x
class ScaledDotProductAttention(dg.Layer):
def __init__(self, d_key):
super(ScaledDotProductAttention, self).__init__()
@@ -31,7 +55,13 @@ class ScaledDotProductAttention(dg.Layer):
self.d_key = d_key
# NOTE: this mask convention differs from the PyTorch implementation
def forward(self, key, value, query, mask=None, query_mask=None, dropout=0.1):
def forward(self,
key,
value,
query,
mask=None,
query_mask=None,
dropout=0.1):
"""
Scaled Dot Product Attention.
@@ -47,13 +77,14 @@ class ScaledDotProductAttention(dg.Layer):
attention (Variable), Shape(n_head * B, T, C), the attention of key.
"""
# Compute attention score
attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y
attention = layers.matmul(
query, key, transpose_y=True) #transpose the last dim in y
attention = attention / math.sqrt(self.d_key)
# Mask key to ignore padding
if mask is not None:
attention = attention * mask
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
mask = (mask == 0).astype(np.float32) * (-2**32 + 1)
attention = attention + mask
attention = layers.softmax(attention)
@@ -66,8 +97,16 @@ class ScaledDotProductAttention(dg.Layer):
result = layers.matmul(attention, value)
return result, attention
class MultiheadAttention(dg.Layer):
def __init__(self, num_hidden, d_k, d_q, num_head=4, is_bias=False, dropout=0.1, is_concat=True):
def __init__(self,
num_hidden,
d_k,
d_q,
num_head=4,
is_bias=False,
dropout=0.1,
is_concat=True):
super(MultiheadAttention, self).__init__()
self.num_hidden = num_hidden
self.num_head = num_head
@@ -109,28 +148,42 @@ class MultiheadAttention(dg.Layer):
# repeat masks h times
if query_mask is not None:
query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
query_mask = layers.expand(query_mask,
[self.num_head, 1, seq_len_key])
if mask is not None:
mask = layers.expand(mask, (self.num_head, 1, 1))
# Make multihead attention
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k])
query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q])
key = layers.reshape(
self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
value = layers.reshape(
self.value(value),
[batch_size, seq_len_key, self.num_head, self.d_k])
query = layers.reshape(
self.query(query_input),
[batch_size, seq_len_query, self.num_head, self.d_q])
key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q])
key = layers.reshape(
layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
value = layers.reshape(
layers.transpose(value, [2, 0, 1, 3]),
[-1, seq_len_key, self.d_k])
query = layers.reshape(
layers.transpose(query, [2, 0, 1, 3]),
[-1, seq_len_query, self.d_q])
result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
result, attention = self.scal_attn(
key, value, query, mask=mask, query_mask=query_mask)
# concat all multihead result
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
result = layers.reshape(
result, [self.num_head, batch_size, seq_len_query, self.d_q])
result = layers.reshape(
layers.transpose(result, [1, 2, 0, 3]),
[batch_size, seq_len_query, -1])
if self.is_concat:
result = layers.concat([query_input,result], axis=-1)
result = layers.concat([query_input, result], axis=-1)
result = layers.dropout(self.fc(result), self.dropout)
result = result + query_input
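The masking convention above is worth unpacking: scores are first multiplied by the 0/1 mask, then a very large negative number (-2^32 + 1) is added at masked positions so that softmax drives them to ~0. A NumPy sketch of the scaled dot-product core:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

B, Tq, Tk, d = 2, 4, 6, 8
q, k, v = (np.random.randn(B, n, d) for n in (Tq, Tk, Tk))
mask = np.ones((B, Tq, Tk))
mask[:, :, -2:] = 0                          # hide the last two key positions

scores = q @ k.transpose(0, 2, 1) / np.sqrt(d)
scores = scores * mask + (mask == 0) * (-2.0 ** 32 + 1)
attn = softmax(scores)
print(attn[0, 0, -2:])                       # ~0 at masked keys
print((attn @ v).shape)                      # (2, 4, 8)
```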

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from torch import nn
import paddle.fluid.dygraph as dg
@@ -10,8 +24,8 @@ def summary(layer):
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
num_elements += np.prod(param.shape)
num_params += 1
print("layer has {} parameters, {} elements.".format(
num_params, num_elements))
print("layer has {} parameters, {} elements.".format(num_params,
num_elements))
def freeze(layer):
@@ -31,5 +45,5 @@ def torch_summary(layer):
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
num_elements += np.prod(param.shape)
num_params += 1
print("layer has {} parameters, {} elements.".format(
num_params, num_elements))
print("layer has {} parameters, {} elements.".format(num_params,
num_elements))

View File

@@ -1,13 +1,27 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import re
from setuptools import setup, find_packages
def read(*names, **kwargs):
with io.open(
os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")
) as fp:
os.path.join(os.path.dirname(__file__), *names),
encoding=kwargs.get("encoding", "utf8")) as fp:
return fp.read()
@@ -19,6 +33,7 @@ def find_version(*file_paths):
return version_match.group(1)
raise RuntimeError("Unable to find version string.")
VERSION = find_version('parakeet', '__init__.py')
long_description = read('README.md')
@@ -32,17 +47,26 @@ setup_info = dict(
description='Speech synthesis tools and models based on Paddlepaddle',
long_description=long_description,
license='Apache 2',
install_requires=[
'numpy', 'nltk', 'inflect', 'librosa', 'unidecode', 'numba',
'tqdm', 'matplotlib', 'tensorboardX', 'tensorboard', 'scipy',
'ruamel.yaml', 'pandas', 'sox', 'soundfile',
'numpy',
'nltk',
'inflect',
'librosa',
'unidecode',
'numba',
'tqdm',
'matplotlib',
'tensorboardX',
'tensorboard',
'scipy',
'ruamel.yaml',
'pandas',
'sox',
'soundfile',
],
# Package info
packages=find_packages(exclude=('tests', 'tests.*')),
zip_safe=True,
)
zip_safe=True, )
setup(**setup_info)

View File

@@ -1,3 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.ljspeech import LJSpeech
from parakeet.data.datacargo import DataCargo

View File

@@ -1,11 +1,25 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets import vctk
from pathlib import Path
from parakeet.data.datacargo import DataCargo
root = Path("/workspace/datasets/VCTK-Corpus")
vctk_dataset = vctk.VCTK(root)
vctk_cargo = DataCargo(vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
vctk_cargo = DataCargo(
vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
for i, batch in enumerate(vctk_cargo):
print(i)

tools/copyright.hook (new normal file, 121 lines)
View File

@@ -0,0 +1,121 @@
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import io, re
import sys, os
import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
LANG_COMMENT_MARK = None
NEW_LINE_MARK = None
COPYRIGHT_HEADER = None
if platform.system() == "Windows":
NEW_LINE_MARK = "\r\n"
else:
NEW_LINE_MARK = '\n'
COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
date, err = process.communicate()
date = date.decode("utf-8").rstrip("\n")
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
if lang == 'Python':
LANG_COMMENT_MARK = '#'
else:
LANG_COMMENT_MARK = "//"
lines = template.split(NEW_LINE_MARK)
BLANK = " "
ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
for lino, line in enumerate(lines):
if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
if len(line) == 0:
BLANK = ""
else:
BLANK = " "
ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
return ans + "\n"
def lang_type(filename):
if filename.endswith(".py"):
return "Python"
elif filename.endswith(".h"):
return "C"
elif filename.endswith(".c"):
return "C"
elif filename.endswith(".hpp"):
return "C"
elif filename.endswith(".cc"):
return "C"
elif filename.endswith(".cpp"):
return "C"
elif filename.endswith(".cu"):
return "C"
elif filename.endswith(".cuh"):
return "C"
elif filename.endswith(".go"):
return "C"
elif filename.endswith(".proto"):
return "C"
else:
print("Unsupported filetype %s", filename)
exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
def main(argv=None):
parser = argparse.ArgumentParser(
description='Checker for copyright declaration.')
parser.add_argument('filenames', nargs='*', help='Filenames to check')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
fd = io.open(filename, encoding="utf-8")
first_line = fd.readline()
second_line = fd.readline()
if "COPYRIGHT (C)" in first_line.upper(): continue
if first_line.startswith("#!") or PYTHON_ENCODE.match(
second_line) != None or PYTHON_ENCODE.match(first_line) != None:
continue
original_contents = io.open(filename, encoding="utf-8").read()
new_contents = generate_copyright(
COPYRIGHT, lang_type(filename)) + original_contents
print('Auto Insert Copyright Header {}'.format(filename))
retv = 1
with io.open(filename, 'w') as output_file:
output_file.write(new_contents)
return retv
if __name__ == '__main__':
exit(main())
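For a Python file, the hook's `generate_copyright` step amounts to prefixing each notice line with the comment mark (blank lines get a bare mark). A simplified standalone sketch, without the year substitution or platform handling:

```python
NOTICE = """Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");"""

def comment_block(text, mark="#"):
    out = []
    for line in text.split("\n"):
        out.append(mark if not line else "{} {}".format(mark, line))
    return "\n".join(out) + "\n\n"

print(comment_block(NOTICE))
```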