add license
This commit is contained in:
parent
f84d6bec91
commit
9d79699432
|
@ -25,3 +25,11 @@
|
|||
files: \.md$
|
||||
- id: remove-tabs
|
||||
files: \.md$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: copyright_checker
|
||||
name: copyright_checker
|
||||
entry: python ./tools/copyright.hook
|
||||
language: system
|
||||
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
|
||||
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Deepvoice 3
|
||||
# Deepvoice 3
|
||||
|
||||
Paddle implementation of Deep Voice 3 in dynamic graph, a convolutional-network-based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
|
||||
|
||||
|
@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
|
|||
## Project Structure
|
||||
|
||||
```text
|
||||
├── data.py data_processing
|
||||
├── data.py data_processing
|
||||
├── ljspeech.yaml (example) configuration file
|
||||
├── sentences.txt sample sentences
|
||||
├── synthesis.py script to synthesize waveform from text
|
||||
|
@ -50,7 +50,7 @@ optional arguments:
|
|||
The directory to save result.
|
||||
-g DEVICE, --device DEVICE
|
||||
device to use
|
||||
```
|
||||
```
|
||||
|
||||
1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. You can also change some values in the configuration file and train the model with a different config.
|
||||
2. `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt).
|
||||
|
@ -61,7 +61,7 @@ optional arguments:
|
|||
├── checkpoints # checkpoint
|
||||
├── log # tensorboard log
|
||||
└── states # train and evaluation results
|
||||
├── alignments # attention
|
||||
├── alignments # attention
|
||||
├── lin_spec # linear spectrogram
|
||||
├── mel_spec # mel spectrogram
|
||||
└── waveform # waveform (.wav files)
|
||||
|
@ -112,4 +112,3 @@ example script:
|
|||
```bash
|
||||
python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
|
||||
```
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
@ -79,10 +93,11 @@ class Transform(object):
|
|||
y = signal.lfilter([1., -self.preemphasis], [1.], wav)
|
||||
|
||||
# STFT
|
||||
D = librosa.stft(y=y,
|
||||
n_fft=self.n_fft,
|
||||
win_length=self.win_length,
|
||||
hop_length=self.hop_length)
|
||||
D = librosa.stft(
|
||||
y=y,
|
||||
n_fft=self.n_fft,
|
||||
win_length=self.win_length,
|
||||
hop_length=self.hop_length)
|
||||
S = np.abs(D)
|
||||
|
||||
# to db and normalize to 0-1
|
||||
|
@ -96,11 +111,8 @@ class Transform(object):
|
|||
|
||||
# mel scale and to db and normalize to 0-1,
|
||||
# CAUTION: pass linear scale S, not dbscaled S
|
||||
S_mel = librosa.feature.melspectrogram(S=S,
|
||||
n_mels=self.n_mels,
|
||||
fmin=self.fmin,
|
||||
fmax=self.fmax,
|
||||
power=1.)
|
||||
S_mel = librosa.feature.melspectrogram(
|
||||
S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
|
||||
S_mel = 20 * np.log10(np.maximum(amplitude_min,
|
||||
S_mel)) - self.ref_level_db
|
||||
S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
|
||||
|
@ -148,20 +160,18 @@ class DataCollector(object):
|
|||
(mix_grapheme_phonemes, text_length, speaker_id, S_norm,
|
||||
S_mel_norm, num_frames) = example
|
||||
text_sequences.append(
|
||||
np.pad(mix_grapheme_phonemes,
|
||||
(0, max_text_length - text_length)))
|
||||
np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length
|
||||
)))
|
||||
lin_specs.append(
|
||||
np.pad(S_norm,
|
||||
((0, 0), (self._pad_begin,
|
||||
max_frames - self._pad_begin - num_frames))))
|
||||
np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -
|
||||
self._pad_begin - num_frames))))
|
||||
mel_specs.append(
|
||||
np.pad(S_mel_norm,
|
||||
((0, 0), (self._pad_begin,
|
||||
max_frames - self._pad_begin - num_frames))))
|
||||
np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -
|
||||
self._pad_begin - num_frames))))
|
||||
done_flags.append(
|
||||
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
|
||||
(0, max_decoder_length -
|
||||
int(np.ceil(num_frames // self._factor))),
|
||||
(0, max_decoder_length - int(
|
||||
np.ceil(num_frames // self._factor))),
|
||||
constant_values=1))
|
||||
text_sequences = np.array(text_sequences).astype(np.int64)
|
||||
lin_specs = np.transpose(np.array(lin_specs),
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import ruamel.yaml
|
||||
|
@ -22,11 +36,8 @@ if __name__ == "__main__":
|
|||
parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
|
||||
parser.add_argument("text", type=str, help="text file to synthesize")
|
||||
parser.add_argument("output_path", type=str, help="path to save results")
|
||||
parser.add_argument("-g",
|
||||
"--device",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="device to use")
|
||||
parser.add_argument(
|
||||
"-g", "--device", type=int, default=-1, help="device to use")
|
||||
|
||||
args = parser.parse_args()
|
||||
with open(args.config, 'rt') as f:
|
||||
|
@ -76,15 +87,14 @@ if __name__ == "__main__":
|
|||
window_ahead = model_config["window_ahead"]
|
||||
key_projection = model_config["key_projection"]
|
||||
value_projection = model_config["value_projection"]
|
||||
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
||||
padding_idx, embedding_std, max_positions, n_vocab,
|
||||
freeze_embedding, filter_size, encoder_channels,
|
||||
n_mels, decoder_channels, r,
|
||||
trainable_positional_encodings, use_memory_mask,
|
||||
query_position_rate, key_position_rate,
|
||||
window_backward, window_ahead, key_projection,
|
||||
value_projection, downsample_factor, linear_dim,
|
||||
use_decoder_states, converter_channels, dropout)
|
||||
dv3 = make_model(
|
||||
n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
|
||||
embedding_std, max_positions, n_vocab, freeze_embedding,
|
||||
filter_size, encoder_channels, n_mels, decoder_channels, r,
|
||||
trainable_positional_encodings, use_memory_mask,
|
||||
query_position_rate, key_position_rate, window_backward,
|
||||
window_ahead, key_projection, value_projection, downsample_factor,
|
||||
linear_dim, use_decoder_states, converter_channels, dropout)
|
||||
|
||||
summary(dv3)
|
||||
state, _ = dg.load_dygraph(args.checkpoint)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import ruamel.yaml
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
from matplotlib import cm
|
||||
|
@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
|||
converter_channels, dropout):
|
||||
"""just a simple function to create a deepvoice 3 model"""
|
||||
if n_speakers > 1:
|
||||
spe = dg.Embedding((n_speakers, speaker_dim),
|
||||
param_attr=I.Normal(scale=speaker_embed_std))
|
||||
spe = dg.Embedding(
|
||||
(n_speakers, speaker_dim),
|
||||
param_attr=I.Normal(scale=speaker_embed_std))
|
||||
else:
|
||||
spe = None
|
||||
|
||||
|
@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
|||
ConvSpec(h, k, 9),
|
||||
ConvSpec(h, k, 27),
|
||||
ConvSpec(h, k, 1),
|
||||
ConvSpec(h, k, 3),
|
||||
)
|
||||
enc = Encoder(n_vocab,
|
||||
embed_dim,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
padding_idx=None,
|
||||
embedding_weight_std=embedding_std,
|
||||
convolutions=encoder_convolutions,
|
||||
max_positions=max_positions,
|
||||
dropout=dropout)
|
||||
ConvSpec(h, k, 3), )
|
||||
enc = Encoder(
|
||||
n_vocab,
|
||||
embed_dim,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
padding_idx=None,
|
||||
embedding_weight_std=embedding_std,
|
||||
convolutions=encoder_convolutions,
|
||||
max_positions=max_positions,
|
||||
dropout=dropout)
|
||||
if freeze_embedding:
|
||||
freeze(enc.embed)
|
||||
|
||||
|
@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
|||
ConvSpec(h, k, 3),
|
||||
ConvSpec(h, k, 9),
|
||||
ConvSpec(h, k, 27),
|
||||
ConvSpec(h, k, 1),
|
||||
)
|
||||
ConvSpec(h, k, 1), )
|
||||
attention = [True, False, False, False, True]
|
||||
force_monotonic_attention = [True, False, False, False, True]
|
||||
dec = Decoder(n_speakers,
|
||||
speaker_dim,
|
||||
embed_dim,
|
||||
mel_dim,
|
||||
r=r,
|
||||
max_positions=max_positions,
|
||||
padding_idx=padding_idx,
|
||||
preattention=prenet_convolutions,
|
||||
convolutions=attentive_convolutions,
|
||||
attention=attention,
|
||||
dropout=dropout,
|
||||
use_memory_mask=use_memory_mask,
|
||||
force_monotonic_attention=force_monotonic_attention,
|
||||
query_position_rate=query_position_rate,
|
||||
key_position_rate=key_position_rate,
|
||||
window_range=WindowRange(window_behind, window_ahead),
|
||||
key_projection=key_projection,
|
||||
value_projection=value_projection)
|
||||
dec = Decoder(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
embed_dim,
|
||||
mel_dim,
|
||||
r=r,
|
||||
max_positions=max_positions,
|
||||
padding_idx=padding_idx,
|
||||
preattention=prenet_convolutions,
|
||||
convolutions=attentive_convolutions,
|
||||
attention=attention,
|
||||
dropout=dropout,
|
||||
use_memory_mask=use_memory_mask,
|
||||
force_monotonic_attention=force_monotonic_attention,
|
||||
query_position_rate=query_position_rate,
|
||||
key_position_rate=key_position_rate,
|
||||
window_range=WindowRange(window_behind, window_ahead),
|
||||
key_projection=key_projection,
|
||||
value_projection=value_projection)
|
||||
if not trainable_positional_encodings:
|
||||
freeze(dec.embed_keys_positions)
|
||||
freeze(dec.embed_query_positions)
|
||||
|
@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
|
|||
ConvSpec(h, k, 1),
|
||||
ConvSpec(h, k, 3),
|
||||
ConvSpec(2 * h, k, 1),
|
||||
ConvSpec(2 * h, k, 3),
|
||||
)
|
||||
cvt = Converter(n_speakers,
|
||||
speaker_dim,
|
||||
dec.state_dim if use_decoder_states else mel_dim,
|
||||
linear_dim,
|
||||
time_upsampling=downsample_factor,
|
||||
convolutions=postnet_convolutions,
|
||||
dropout=dropout)
|
||||
ConvSpec(2 * h, k, 3), )
|
||||
cvt = Converter(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
dec.state_dim if use_decoder_states else mel_dim,
|
||||
linear_dim,
|
||||
time_upsampling=downsample_factor,
|
||||
convolutions=postnet_convolutions,
|
||||
dropout=dropout)
|
||||
dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
|
||||
return dv3
|
||||
|
||||
|
@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
|
|||
ref_level_db, power, n_iter, win_length, hop_length,
|
||||
preemphasis):
|
||||
"""generate waveform from text using a deepvoice 3 model"""
|
||||
text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob),
|
||||
dtype=np.int64)
|
||||
text = np.array(
|
||||
en.text_to_sequence(
|
||||
text, p=replace_pronounciation_prob),
|
||||
dtype=np.int64)
|
||||
length = len(text)
|
||||
print("text sequence's length: {}".format(length))
|
||||
text_positions = np.arange(1, 1 + length)
|
||||
|
@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
|
|||
"""
|
||||
denoramlized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
|
||||
lin_scaled = np.exp((denoramlized + ref_level_db) / 20 * np.log(10))
|
||||
wav = librosa.griffinlim(lin_scaled**power,
|
||||
n_iter=n_iter,
|
||||
hop_length=hop_length,
|
||||
win_length=win_length)
|
||||
wav = librosa.griffinlim(
|
||||
lin_scaled**power,
|
||||
n_iter=n_iter,
|
||||
hop_length=hop_length,
|
||||
win_length=win_length)
|
||||
if preemphasis > 0:
|
||||
wav = signal.lfilter([1.], [1., -preemphasis], wav)
|
||||
return wav
|
||||
|
@ -225,28 +243,30 @@ def save_state(save_dir,
|
|||
plt.colorbar()
|
||||
plt.title("mel_input")
|
||||
plt.savefig(
|
||||
os.path.join(path,
|
||||
"target_mel_spec_step{:09d}.png".format(global_step)))
|
||||
os.path.join(path, "target_mel_spec_step{:09d}.png".format(
|
||||
global_step)))
|
||||
plt.close()
|
||||
|
||||
writer.add_image("target/mel_spec",
|
||||
cm.viridis(mel_input),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
writer.add_image(
|
||||
"target/mel_spec",
|
||||
cm.viridis(mel_input),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
|
||||
plt.figure(figsize=(10, 3))
|
||||
display.specshow(mel_output)
|
||||
plt.colorbar()
|
||||
plt.title("mel_output")
|
||||
plt.savefig(
|
||||
os.path.join(
|
||||
path, "predicted_mel_spec_step{:09d}.png".format(global_step)))
|
||||
os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
|
||||
global_step)))
|
||||
plt.close()
|
||||
|
||||
writer.add_image("predicted/mel_spec",
|
||||
cm.viridis(mel_output),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
writer.add_image(
|
||||
"predicted/mel_spec",
|
||||
cm.viridis(mel_output),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
|
||||
if lin_input is not None and lin_output is not None:
|
||||
lin_input = lin_input[0].numpy().T
|
||||
|
@ -258,28 +278,30 @@ def save_state(save_dir,
|
|||
plt.colorbar()
|
||||
plt.title("mel_input")
|
||||
plt.savefig(
|
||||
os.path.join(path,
|
||||
"target_lin_spec_step{:09d}.png".format(global_step)))
|
||||
os.path.join(path, "target_lin_spec_step{:09d}.png".format(
|
||||
global_step)))
|
||||
plt.close()
|
||||
|
||||
writer.add_image("target/lin_spec",
|
||||
cm.viridis(lin_input),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
writer.add_image(
|
||||
"target/lin_spec",
|
||||
cm.viridis(lin_input),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
|
||||
plt.figure(figsize=(10, 3))
|
||||
display.specshow(lin_output)
|
||||
plt.colorbar()
|
||||
plt.title("mel_input")
|
||||
plt.savefig(
|
||||
os.path.join(
|
||||
path, "predicted_lin_spec_step{:09d}.png".format(global_step)))
|
||||
os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
|
||||
global_step)))
|
||||
plt.close()
|
||||
|
||||
writer.add_image("predicted/lin_spec",
|
||||
cm.viridis(lin_output),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
writer.add_image(
|
||||
"predicted/lin_spec",
|
||||
cm.viridis(lin_output),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
|
||||
if alignments is not None and len(alignments.shape) == 4:
|
||||
path = os.path.join(save_dir, "alignments")
|
||||
|
@ -290,10 +312,11 @@ def save_state(save_dir,
|
|||
"train_attn_layer_{}_step_{}.png".format(idx, global_step))
|
||||
plot_alignment(attn_layer, save_path)
|
||||
|
||||
writer.add_image("train_attn/layer_{}".format(idx),
|
||||
cm.viridis(attn_layer),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
writer.add_image(
|
||||
"train_attn/layer_{}".format(idx),
|
||||
cm.viridis(attn_layer),
|
||||
global_step,
|
||||
dataformats="HWC")
|
||||
|
||||
if lin_output is not None:
|
||||
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
|
||||
|
@ -302,7 +325,5 @@ def save_state(save_dir,
|
|||
save_path = os.path.join(
|
||||
path, "train_sample_step_{:09d}.wav".format(global_step))
|
||||
sf.write(save_path, wav, sample_rate)
|
||||
writer.add_audio("train_sample",
|
||||
wav,
|
||||
global_step,
|
||||
sample_rate=sample_rate)
|
||||
writer.add_audio(
|
||||
"train_sample", wav, global_step, sample_rate=sample_rate)
|
||||
|
|
|
@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
|
|||
|
||||
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``.
|
||||
|
||||
For more help on arguments:
|
||||
For more help on arguments:
|
||||
``python train.py --help``.
|
||||
|
||||
## Synthesis
|
||||
|
@ -75,5 +75,5 @@ or you can run the script file directly.
|
|||
sh synthesis.sh
|
||||
```
|
||||
|
||||
For more help on arguments:
|
||||
For more help on arguments:
|
||||
``python synthesis.py --help``.
|
||||
|
|
|
@ -1,36 +1,90 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
|
||||
def add_config_options_to_parser(parser):
|
||||
parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
|
||||
parser.add_argument(
|
||||
'--config_path',
|
||||
type=str,
|
||||
default='config/fastspeech.yaml',
|
||||
help="the yaml config file path.")
|
||||
parser.add_argument('--batch_size', type=int, default=32,
|
||||
help="batch size for training.")
|
||||
parser.add_argument('--epochs', type=int, default=10000,
|
||||
parser.add_argument(
|
||||
'--batch_size', type=int, default=32, help="batch size for training.")
|
||||
parser.add_argument(
|
||||
'--epochs',
|
||||
type=int,
|
||||
default=10000,
|
||||
help="the number of epoch for training.")
|
||||
parser.add_argument('--lr', type=float, default=0.001,
|
||||
parser.add_argument(
|
||||
'--lr',
|
||||
type=float,
|
||||
default=0.001,
|
||||
help="the learning rate for training.")
|
||||
parser.add_argument('--save_step', type=int, default=500,
|
||||
parser.add_argument(
|
||||
'--save_step',
|
||||
type=int,
|
||||
default=500,
|
||||
help="checkpointing interval during training.")
|
||||
parser.add_argument('--fastspeech_step', type=int, default=70000,
|
||||
parser.add_argument(
|
||||
'--fastspeech_step',
|
||||
type=int,
|
||||
default=70000,
|
||||
help="Global step to restore checkpoint of fastspeech.")
|
||||
parser.add_argument('--use_gpu', type=int, default=1,
|
||||
parser.add_argument(
|
||||
'--use_gpu',
|
||||
type=int,
|
||||
default=1,
|
||||
help="use gpu or not during training.")
|
||||
parser.add_argument('--use_data_parallel', type=int, default=0,
|
||||
parser.add_argument(
|
||||
'--use_data_parallel',
|
||||
type=int,
|
||||
default=0,
|
||||
help="use data parallel or not during training.")
|
||||
|
||||
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
|
||||
parser.add_argument(
|
||||
'--data_path',
|
||||
type=str,
|
||||
default='./dataset/LJSpeech-1.1',
|
||||
help="the path of dataset.")
|
||||
parser.add_argument('--checkpoint_path', type=str, default=None,
|
||||
parser.add_argument(
|
||||
'--checkpoint_path',
|
||||
type=str,
|
||||
default=None,
|
||||
help="the path to load checkpoint or pretrain model.")
|
||||
parser.add_argument('--save_path', type=str, default='./checkpoint',
|
||||
parser.add_argument(
|
||||
'--save_path',
|
||||
type=str,
|
||||
default='./checkpoint',
|
||||
help="the path to save checkpoint.")
|
||||
parser.add_argument('--log_dir', type=str, default='./log',
|
||||
parser.add_argument(
|
||||
'--log_dir',
|
||||
type=str,
|
||||
default='./log',
|
||||
help="the directory to save tensorboard log.")
|
||||
parser.add_argument('--sample_path', type=str, default='./sample',
|
||||
parser.add_argument(
|
||||
'--sample_path',
|
||||
type=str,
|
||||
default='./sample',
|
||||
help="the directory to save audio sample in synthesis.")
|
||||
parser.add_argument('--transtts_path', type=str, default='./log',
|
||||
parser.add_argument(
|
||||
'--transtts_path',
|
||||
type=str,
|
||||
default='./log',
|
||||
help="the directory to load pretrain transformerTTS model.")
|
||||
parser.add_argument('--transformer_step', type=int, default=160000,
|
||||
parser.add_argument(
|
||||
'--transformer_step',
|
||||
type=int,
|
||||
default=160000,
|
||||
help="the step to load transformerTTS model.")
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from tensorboardX import SummaryWriter
|
||||
from collections import OrderedDict
|
||||
|
@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
|
|||
from parakeet import audio
|
||||
from parakeet.models.fastspeech.fastspeech import FastSpeech
|
||||
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||
new_state_dict = OrderedDict()
|
||||
|
@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
|
|||
new_state_dict[param] = model_dict[param]
|
||||
return new_state_dict
|
||||
|
||||
|
||||
def synthesis(text_input, args):
|
||||
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
|
||||
|
||||
# tensorboard
|
||||
if not os.path.exists(args.log_dir):
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir,'synthesis')
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir, 'synthesis')
|
||||
|
||||
with open(args.config_path) as f:
|
||||
cfg = yaml.load(f, Loader=yaml.Loader)
|
||||
|
@ -37,24 +52,28 @@ def synthesis(text_input, args):
|
|||
|
||||
with dg.guard(place):
|
||||
model = FastSpeech(cfg)
|
||||
model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
|
||||
model.set_dict(
|
||||
load_checkpoint(
|
||||
str(args.fastspeech_step),
|
||||
os.path.join(args.checkpoint_path, "fastspeech")))
|
||||
model.eval()
|
||||
|
||||
text = np.asarray(text_to_sequence(text_input))
|
||||
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
|
||||
pos_text = np.arange(1, text.shape[1]+1)
|
||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
|
||||
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
|
||||
pos_text = np.arange(1, text.shape[1] + 1)
|
||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
|
||||
|
||||
mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
|
||||
mel_output, mel_output_postnet = model(
|
||||
text, pos_text, alpha=args.alpha)
|
||||
|
||||
_ljspeech_processor = audio.AudioProcessor(
|
||||
sample_rate=cfg['audio']['sr'],
|
||||
num_mels=cfg['audio']['num_mels'],
|
||||
min_level_db=cfg['audio']['min_level_db'],
|
||||
ref_level_db=cfg['audio']['ref_level_db'],
|
||||
n_fft=cfg['audio']['n_fft'],
|
||||
win_length= cfg['audio']['win_length'],
|
||||
hop_length= cfg['audio']['hop_length'],
|
||||
sample_rate=cfg['audio']['sr'],
|
||||
num_mels=cfg['audio']['num_mels'],
|
||||
min_level_db=cfg['audio']['min_level_db'],
|
||||
ref_level_db=cfg['audio']['ref_level_db'],
|
||||
n_fft=cfg['audio']['n_fft'],
|
||||
win_length=cfg['audio']['win_length'],
|
||||
hop_length=cfg['audio']['hop_length'],
|
||||
power=cfg['audio']['power'],
|
||||
preemphasis=cfg['audio']['preemphasis'],
|
||||
signal_norm=True,
|
||||
|
@ -67,14 +86,17 @@ def synthesis(text_input, args):
|
|||
do_trim_silence=False,
|
||||
sound_norm=False)
|
||||
|
||||
mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
|
||||
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
|
||||
mel_output_postnet = fluid.layers.transpose(
|
||||
fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
|
||||
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy(
|
||||
))
|
||||
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
|
||||
print("Synthesis completed !!!")
|
||||
writer.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Train Fastspeech model")
|
||||
add_config_options_to_parser(parser)
|
||||
args = parser.parse_args()
|
||||
synthesis("Transformer model is so fast!", args)
|
||||
synthesis("Transformer model is so fast!", args)
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
|
@ -20,8 +33,10 @@ import sys
|
|||
sys.path.append("../transformer_tts")
|
||||
from data import LJSpeechLoader
|
||||
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(
|
||||
os.path.join(model_path, step))
|
||||
new_state_dict = OrderedDict()
|
||||
for param in model_dict:
|
||||
if param.startswith('_layers.'):
|
||||
|
@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
|
|||
new_state_dict[param] = model_dict[param]
|
||||
return new_state_dict, opti_dict
|
||||
|
||||
|
||||
def main(args):
|
||||
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
|
||||
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
|
||||
|
@ -43,26 +59,33 @@ def main(args):
|
|||
if args.use_gpu else fluid.CPUPlace())
|
||||
|
||||
if not os.path.exists(args.log_dir):
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir,'fastspeech')
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir, 'fastspeech')
|
||||
|
||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||
|
||||
with dg.guard(place):
|
||||
with fluid.unique_name.guard():
|
||||
transformerTTS = TransformerTTS(cfg)
|
||||
model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
|
||||
model_dict, _ = load_checkpoint(
|
||||
str(args.transformer_step),
|
||||
os.path.join(args.transtts_path, "transformer"))
|
||||
transformerTTS.set_dict(model_dict)
|
||||
transformerTTS.eval()
|
||||
|
||||
model = FastSpeech(cfg)
|
||||
model.train()
|
||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
|
||||
parameter_list=model.parameters())
|
||||
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
|
||||
|
||||
optimizer = fluid.optimizer.AdamOptimizer(
|
||||
learning_rate=dg.NoamDecay(1 / (
|
||||
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
|
||||
parameter_list=model.parameters())
|
||||
reader = LJSpeechLoader(
|
||||
cfg, args, nranks, local_rank, shuffle=True).reader()
|
||||
|
||||
if args.checkpoint_path is not None:
|
||||
model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
|
||||
model_dict, opti_dict = load_checkpoint(
|
||||
str(args.fastspeech_step),
|
||||
os.path.join(args.checkpoint_path, "fastspeech"))
|
||||
model.set_dict(model_dict)
|
||||
optimizer.set_dict(opti_dict)
|
||||
global_step = args.fastspeech_step
|
||||
|
@ -76,31 +99,42 @@ def main(args):
|
|||
pbar = tqdm(reader)
|
||||
|
||||
for i, data in enumerate(pbar):
|
||||
pbar.set_description('Processing at epoch %d'%epoch)
|
||||
pbar.set_description('Processing at epoch %d' % epoch)
|
||||
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
|
||||
|
||||
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
|
||||
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
|
||||
_, _, attn_probs, _, _, _ = transformerTTS(
|
||||
character, mel_input, pos_text, pos_mel)
|
||||
alignment = dg.to_variable(
|
||||
get_alignment(attn_probs, mel_lens, cfg[
|
||||
'transformer_head'])).astype(np.float32)
|
||||
|
||||
global_step += 1
|
||||
|
||||
|
||||
#Forward
|
||||
result= model(character,
|
||||
pos_text,
|
||||
mel_pos=pos_mel,
|
||||
length_target=alignment)
|
||||
result = model(
|
||||
character,
|
||||
pos_text,
|
||||
mel_pos=pos_mel,
|
||||
length_target=alignment)
|
||||
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
|
||||
mel_loss = layers.mse_loss(mel_output, mel)
|
||||
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
|
||||
duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
|
||||
duration_loss = layers.mean(
|
||||
layers.abs(
|
||||
layers.elementwise_sub(duration_predictor_output,
|
||||
alignment)))
|
||||
total_loss = mel_loss + mel_postnet_loss + duration_loss
|
||||
|
||||
if local_rank==0:
|
||||
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
|
||||
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
|
||||
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
|
||||
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
|
||||
|
||||
if local_rank == 0:
|
||||
writer.add_scalar('mel_loss',
|
||||
mel_loss.numpy(), global_step)
|
||||
writer.add_scalar('post_mel_loss',
|
||||
mel_postnet_loss.numpy(), global_step)
|
||||
writer.add_scalar('duration_loss',
|
||||
duration_loss.numpy(), global_step)
|
||||
writer.add_scalar('learning_rate',
|
||||
optimizer._learning_rate.step().numpy(),
|
||||
global_step)
|
||||
|
||||
if args.use_data_parallel:
|
||||
total_loss = model.scale_loss(total_loss)
|
||||
|
@ -108,21 +142,25 @@ def main(args):
|
|||
model.apply_collective_grads()
|
||||
else:
|
||||
total_loss.backward()
|
||||
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
|
||||
optimizer.minimize(
|
||||
total_loss,
|
||||
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
|
||||
'grad_clip_thresh']))
|
||||
model.clear_gradients()
|
||||
|
||||
# save checkpoint
|
||||
if local_rank==0 and global_step % args.save_step == 0:
|
||||
# save checkpoint
|
||||
if local_rank == 0 and global_step % args.save_step == 0:
|
||||
if not os.path.exists(args.save_path):
|
||||
os.mkdir(args.save_path)
|
||||
save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
|
||||
save_path = os.path.join(args.save_path,
|
||||
'fastspeech/%d' % global_step)
|
||||
dg.save_dygraph(model.state_dict(), save_path)
|
||||
dg.save_dygraph(optimizer.state_dict(), save_path)
|
||||
if local_rank==0:
|
||||
if local_rank == 0:
|
||||
writer.close()
|
||||
|
||||
|
||||
if __name__ =='__main__':
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Train Fastspeech model")
|
||||
add_config_options_to_parser(parser)
|
||||
args = parser.parse_args()
|
||||
|
|
|
@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
|
|||
|
||||
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
|
||||
|
||||
For more help on arguments:
|
||||
For more help on arguments:
|
||||
``python train_transformer.py --help``.
|
||||
|
||||
## Train Vocoder
|
||||
|
@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
|
|||
```
|
||||
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step``.
|
||||
|
||||
For more help on arguments:
|
||||
For more help on arguments:
|
||||
``python train_vocoder.py --help``.
|
||||
|
||||
## Synthesis
|
||||
|
@ -101,5 +101,5 @@ sh synthesis.sh
|
|||
|
||||
And the audio file will be saved in ``--sample_path``.
|
||||
|
||||
For more help on arguments:
|
||||
For more help on arguments:
|
||||
``python synthesis.py --help``.
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo
|
|||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||
from parakeet.data.dataset import DatasetMixin, TransformDataset
|
||||
|
||||
|
||||
class LJSpeechLoader:
|
||||
def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
|
||||
def __init__(self,
|
||||
config,
|
||||
args,
|
||||
nranks,
|
||||
rank,
|
||||
is_vocoder=False,
|
||||
shuffle=True):
|
||||
place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
|
||||
|
||||
LJSPEECH_ROOT = Path(args.data_path)
|
||||
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
|
||||
transformer = LJSpeech(config)
|
||||
dataset = TransformDataset(metadata, transformer)
|
||||
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
|
||||
sampler = DistributedSampler(
|
||||
len(metadata), nranks, rank, shuffle=shuffle)
|
||||
|
||||
assert args.batch_size % nranks == 0
|
||||
each_bs = args.batch_size // nranks
|
||||
if is_vocoder:
|
||||
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
|
||||
dataloader = DataCargo(
|
||||
dataset,
|
||||
sampler=sampler,
|
||||
batch_size=each_bs,
|
||||
shuffle=shuffle,
|
||||
batch_fn=batch_examples_vocoder,
|
||||
drop_last=True)
|
||||
else:
|
||||
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
|
||||
|
||||
dataloader = DataCargo(
|
||||
dataset,
|
||||
sampler=sampler,
|
||||
batch_size=each_bs,
|
||||
shuffle=shuffle,
|
||||
batch_fn=batch_examples,
|
||||
drop_last=True)
|
||||
|
||||
self.reader = fluid.io.DataLoader.from_generator(
|
||||
capacity=32,
|
||||
iterable=True,
|
||||
|
@ -63,13 +96,13 @@ class LJSpeech(object):
|
|||
super(LJSpeech, self).__init__()
|
||||
self.config = config
|
||||
self._ljspeech_processor = audio.AudioProcessor(
|
||||
sample_rate=config['audio']['sr'],
|
||||
num_mels=config['audio']['num_mels'],
|
||||
min_level_db=config['audio']['min_level_db'],
|
||||
ref_level_db=config['audio']['ref_level_db'],
|
||||
n_fft=config['audio']['n_fft'],
|
||||
win_length= config['audio']['win_length'],
|
||||
hop_length= config['audio']['hop_length'],
|
||||
sample_rate=config['audio']['sr'],
|
||||
num_mels=config['audio']['num_mels'],
|
||||
min_level_db=config['audio']['min_level_db'],
|
||||
ref_level_db=config['audio']['ref_level_db'],
|
||||
n_fft=config['audio']['n_fft'],
|
||||
win_length=config['audio']['win_length'],
|
||||
hop_length=config['audio']['hop_length'],
|
||||
power=config['audio']['power'],
|
||||
preemphasis=config['audio']['preemphasis'],
|
||||
signal_norm=True,
|
||||
|
@ -81,7 +114,7 @@ class LJSpeech(object):
|
|||
griffin_lim_iters=60,
|
||||
do_trim_silence=False,
|
||||
sound_norm=False)
|
||||
|
||||
|
||||
def __call__(self, metadatum):
|
||||
"""All the code for generating an Example from a metadatum. If you want a
|
||||
different preprocessing pipeline, you can override this method.
|
||||
|
@ -90,13 +123,15 @@ class LJSpeech(object):
|
|||
method.
|
||||
"""
|
||||
fname, raw_text, normalized_text = metadatum
|
||||
|
||||
|
||||
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
|
||||
wav = self._ljspeech_processor.load_wav(str(fname))
|
||||
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
|
||||
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
|
||||
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
|
||||
phonemes = np.array(
|
||||
g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
|
||||
return (mag, mel, phonemes
|
||||
) # maybe we need to implement it as a map in the future
|
||||
|
||||
|
||||
def batch_examples(batch):
|
||||
|
@ -109,44 +144,71 @@ def batch_examples(batch):
|
|||
pos_mels = []
|
||||
for data in batch:
|
||||
_, mel, text = data
|
||||
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
|
||||
mel_inputs.append(
|
||||
np.concatenate(
|
||||
[np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
|
||||
axis=-1))
|
||||
mel_lens.append(mel.shape[1])
|
||||
text_lens.append(len(text))
|
||||
pos_texts.append(np.arange(1, len(text) + 1))
|
||||
pos_mels.append(np.arange(1, mel.shape[1] + 1))
|
||||
mels.append(mel)
|
||||
texts.append(text)
|
||||
|
||||
|
||||
# Sort by text_len in descending order
|
||||
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
|
||||
texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mel_inputs = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
mel_lens = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
pos_texts = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
pos_mels = [
|
||||
i
|
||||
for i, _ in sorted(
|
||||
zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
|
||||
]
|
||||
text_lens = sorted(text_lens, reverse=True)
|
||||
|
||||
# Pad sequence with largest len of the batch
|
||||
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
|
||||
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
|
||||
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
|
||||
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
|
||||
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
|
||||
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
|
||||
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
|
||||
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
|
||||
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
|
||||
mels = np.transpose(
|
||||
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||
mel_inputs = np.transpose(
|
||||
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
|
||||
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
|
||||
np.array(mel_lens))
|
||||
|
||||
|
||||
def batch_examples_vocoder(batch):
|
||||
mels=[]
|
||||
mags=[]
|
||||
mels = []
|
||||
mags = []
|
||||
for data in batch:
|
||||
mag, mel, _ = data
|
||||
mels.append(mel)
|
||||
mags.append(mag)
|
||||
|
||||
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
|
||||
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
|
||||
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
|
||||
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
|
||||
|
||||
return (mels, mags)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,38 +1,100 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
|
||||
def add_config_options_to_parser(parser):
|
||||
parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
|
||||
parser.add_argument(
|
||||
'--config_path',
|
||||
type=str,
|
||||
default='config/train_transformer.yaml',
|
||||
help="the yaml config file path.")
|
||||
parser.add_argument('--batch_size', type=int, default=32,
|
||||
help="batch size for training.")
|
||||
parser.add_argument('--epochs', type=int, default=10000,
|
||||
parser.add_argument(
|
||||
'--batch_size', type=int, default=32, help="batch size for training.")
|
||||
parser.add_argument(
|
||||
'--epochs',
|
||||
type=int,
|
||||
default=10000,
|
||||
help="the number of epoch for training.")
|
||||
parser.add_argument('--lr', type=float, default=0.001,
|
||||
parser.add_argument(
|
||||
'--lr',
|
||||
type=float,
|
||||
default=0.001,
|
||||
help="the learning rate for training.")
|
||||
parser.add_argument('--save_step', type=int, default=500,
|
||||
parser.add_argument(
|
||||
'--save_step',
|
||||
type=int,
|
||||
default=500,
|
||||
help="checkpointing interval during training.")
|
||||
parser.add_argument('--image_step', type=int, default=2000,
|
||||
parser.add_argument(
|
||||
'--image_step',
|
||||
type=int,
|
||||
default=2000,
|
||||
help="attention image interval during training.")
|
||||
parser.add_argument('--max_len', type=int, default=400,
|
||||
parser.add_argument(
|
||||
'--max_len',
|
||||
type=int,
|
||||
default=400,
|
||||
help="The max length of audio when synthsis.")
|
||||
parser.add_argument('--transformer_step', type=int, default=160000,
|
||||
parser.add_argument(
|
||||
'--transformer_step',
|
||||
type=int,
|
||||
default=160000,
|
||||
help="Global step to restore checkpoint of transformer.")
|
||||
parser.add_argument('--vocoder_step', type=int, default=90000,
|
||||
parser.add_argument(
|
||||
'--vocoder_step',
|
||||
type=int,
|
||||
default=90000,
|
||||
help="Global step to restore checkpoint of postnet.")
|
||||
parser.add_argument('--use_gpu', type=int, default=1,
|
||||
parser.add_argument(
|
||||
'--use_gpu',
|
||||
type=int,
|
||||
default=1,
|
||||
help="use gpu or not during training.")
|
||||
parser.add_argument('--use_data_parallel', type=int, default=0,
|
||||
parser.add_argument(
|
||||
'--use_data_parallel',
|
||||
type=int,
|
||||
default=0,
|
||||
help="use data parallel or not during training.")
|
||||
parser.add_argument('--stop_token', type=int, default=0,
|
||||
parser.add_argument(
|
||||
'--stop_token',
|
||||
type=int,
|
||||
default=0,
|
||||
help="use stop token loss in network or not.")
|
||||
|
||||
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
|
||||
parser.add_argument(
|
||||
'--data_path',
|
||||
type=str,
|
||||
default='./dataset/LJSpeech-1.1',
|
||||
help="the path of dataset.")
|
||||
parser.add_argument('--checkpoint_path', type=str, default=None,
|
||||
parser.add_argument(
|
||||
'--checkpoint_path',
|
||||
type=str,
|
||||
default=None,
|
||||
help="the path to load checkpoint or pretrain model.")
|
||||
parser.add_argument('--save_path', type=str, default='./checkpoint',
|
||||
parser.add_argument(
|
||||
'--save_path',
|
||||
type=str,
|
||||
default='./checkpoint',
|
||||
help="the path to save checkpoint.")
|
||||
parser.add_argument('--log_dir', type=str, default='./log',
|
||||
parser.add_argument(
|
||||
'--log_dir',
|
||||
type=str,
|
||||
default='./log',
|
||||
help="the directory to save tensorboard log.")
|
||||
parser.add_argument('--sample_path', type=str, default='./sample',
|
||||
parser.add_argument(
|
||||
'--sample_path',
|
||||
type=str,
|
||||
default='./sample',
|
||||
help="the directory to save audio sample in synthesis.")
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from scipy.io.wavfile import write
|
||||
from parakeet.g2p.en import text_to_sequence
|
||||
|
@ -16,6 +29,7 @@ from parakeet import audio
|
|||
from parakeet.models.transformer_tts.vocoder import Vocoder
|
||||
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
|
||||
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||
new_state_dict = OrderedDict()
|
||||
|
@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
|
|||
new_state_dict[param] = model_dict[param]
|
||||
return new_state_dict
|
||||
|
||||
|
||||
def synthesis(text_input, args):
|
||||
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
|
||||
|
||||
|
@ -34,46 +49,53 @@ def synthesis(text_input, args):
|
|||
|
||||
# tensorboard
|
||||
if not os.path.exists(args.log_dir):
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir,'synthesis')
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir, 'synthesis')
|
||||
|
||||
writer = SummaryWriter(path)
|
||||
|
||||
with dg.guard(place):
|
||||
with fluid.unique_name.guard():
|
||||
model = TransformerTTS(cfg)
|
||||
model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")))
|
||||
model.set_dict(
|
||||
load_checkpoint(
|
||||
str(args.transformer_step),
|
||||
os.path.join(args.checkpoint_path, "transformer")))
|
||||
model.eval()
|
||||
|
||||
|
||||
with fluid.unique_name.guard():
|
||||
model_vocoder = Vocoder(cfg, args.batch_size)
|
||||
model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")))
|
||||
model_vocoder.set_dict(
|
||||
load_checkpoint(
|
||||
str(args.vocoder_step),
|
||||
os.path.join(args.checkpoint_path, "vocoder")))
|
||||
model_vocoder.eval()
|
||||
# init input
|
||||
text = np.asarray(text_to_sequence(text_input))
|
||||
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
|
||||
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32)
|
||||
pos_text = np.arange(1, text.shape[1]+1)
|
||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
|
||||
|
||||
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
|
||||
mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
|
||||
pos_text = np.arange(1, text.shape[1] + 1)
|
||||
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
|
||||
|
||||
pbar = tqdm(range(args.max_len))
|
||||
|
||||
for i in pbar:
|
||||
pos_mel = np.arange(1, mel_input.shape[1]+1)
|
||||
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0])
|
||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel)
|
||||
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1)
|
||||
pos_mel = np.arange(1, mel_input.shape[1] + 1)
|
||||
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
|
||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
||||
text, mel_input, pos_text, pos_mel)
|
||||
mel_input = fluid.layers.concat(
|
||||
[mel_input, postnet_pred[:, -1:, :]], axis=1)
|
||||
mag_pred = model_vocoder(postnet_pred)
|
||||
|
||||
_ljspeech_processor = audio.AudioProcessor(
|
||||
sample_rate=cfg['audio']['sr'],
|
||||
num_mels=cfg['audio']['num_mels'],
|
||||
min_level_db=cfg['audio']['min_level_db'],
|
||||
ref_level_db=cfg['audio']['ref_level_db'],
|
||||
n_fft=cfg['audio']['n_fft'],
|
||||
win_length= cfg['audio']['win_length'],
|
||||
hop_length= cfg['audio']['hop_length'],
|
||||
sample_rate=cfg['audio']['sr'],
|
||||
num_mels=cfg['audio']['num_mels'],
|
||||
min_level_db=cfg['audio']['min_level_db'],
|
||||
ref_level_db=cfg['audio']['ref_level_db'],
|
||||
n_fft=cfg['audio']['n_fft'],
|
||||
win_length=cfg['audio']['win_length'],
|
||||
hop_length=cfg['audio']['hop_length'],
|
||||
power=cfg['audio']['power'],
|
||||
preemphasis=cfg['audio']['preemphasis'],
|
||||
signal_norm=True,
|
||||
|
@ -86,13 +108,18 @@ def synthesis(text_input, args):
|
|||
do_trim_silence=False,
|
||||
sound_norm=False)
|
||||
|
||||
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
|
||||
wav = _ljspeech_processor.inv_spectrogram(
|
||||
fluid.layers.transpose(
|
||||
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
|
||||
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
|
||||
if not os.path.exists(args.sample_path):
|
||||
os.mkdir(args.sample_path)
|
||||
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
|
||||
write(
|
||||
os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
|
||||
wav)
|
||||
writer.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Synthesis model")
|
||||
add_config_options_to_parser(parser)
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
from tensorboardX import SummaryWriter
|
||||
|
@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
|
|||
from data import LJSpeechLoader
|
||||
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
|
||||
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
|
||||
model_dict, opti_dict = fluid.dygraph.load_dygraph(
|
||||
os.path.join(model_path, step))
|
||||
new_state_dict = OrderedDict()
|
||||
for param in model_dict:
|
||||
if param.startswith('_layers.'):
|
||||
|
@ -40,22 +55,27 @@ def main(args):
|
|||
if args.use_gpu else fluid.CPUPlace())
|
||||
|
||||
if not os.path.exists(args.log_dir):
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir,'transformer')
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir, 'transformer')
|
||||
|
||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||
|
||||
|
||||
with dg.guard(place):
|
||||
model = TransformerTTS(cfg)
|
||||
|
||||
model.train()
|
||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
|
||||
parameter_list=model.parameters())
|
||||
|
||||
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
|
||||
optimizer = fluid.optimizer.AdamOptimizer(
|
||||
learning_rate=dg.NoamDecay(1 / (
|
||||
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
|
||||
parameter_list=model.parameters())
|
||||
|
||||
reader = LJSpeechLoader(
|
||||
cfg, args, nranks, local_rank, shuffle=True).reader()
|
||||
|
||||
if args.checkpoint_path is not None:
|
||||
model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
|
||||
model_dict, opti_dict = load_checkpoint(
|
||||
str(args.transformer_step),
|
||||
os.path.join(args.checkpoint_path, "transformer"))
|
||||
model.set_dict(model_dict)
|
||||
optimizer.set_dict(opti_dict)
|
||||
global_step = args.transformer_step
|
||||
|
@ -64,86 +84,112 @@ def main(args):
|
|||
if args.use_data_parallel:
|
||||
strategy = dg.parallel.prepare_context()
|
||||
model = fluid.dygraph.parallel.DataParallel(model, strategy)
|
||||
|
||||
|
||||
for epoch in range(args.epochs):
|
||||
pbar = tqdm(reader)
|
||||
for i, data in enumerate(pbar):
|
||||
pbar.set_description('Processing at epoch %d'%epoch)
|
||||
pbar.set_description('Processing at epoch %d' % epoch)
|
||||
character, mel, mel_input, pos_text, pos_mel, text_length, _ = data
|
||||
|
||||
global_step += 1
|
||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
|
||||
|
||||
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
|
||||
character, mel_input, pos_text, pos_mel)
|
||||
|
||||
label = (pos_mel == 0).astype(np.float32)
|
||||
|
||||
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
||||
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
||||
|
||||
mel_loss = layers.mean(
|
||||
layers.abs(layers.elementwise_sub(mel_pred, mel)))
|
||||
post_mel_loss = layers.mean(
|
||||
layers.abs(layers.elementwise_sub(postnet_pred, mel)))
|
||||
loss = mel_loss + post_mel_loss
|
||||
# Note: When used stop token loss the learning did not work.
|
||||
if args.stop_token:
|
||||
stop_loss = cross_entropy(stop_preds, label)
|
||||
loss = loss + stop_loss
|
||||
|
||||
if local_rank==0:
|
||||
if local_rank == 0:
|
||||
writer.add_scalars('training_loss', {
|
||||
'mel_loss':mel_loss.numpy(),
|
||||
'post_mel_loss':post_mel_loss.numpy()
|
||||
'mel_loss': mel_loss.numpy(),
|
||||
'post_mel_loss': post_mel_loss.numpy()
|
||||
}, global_step)
|
||||
|
||||
if args.stop_token:
|
||||
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
|
||||
writer.add_scalar('stop_loss',
|
||||
stop_loss.numpy(), global_step)
|
||||
|
||||
if args.use_data_parallel:
|
||||
writer.add_scalars('alphas', {
|
||||
'encoder_alpha':model._layers.encoder.alpha.numpy(),
|
||||
'decoder_alpha':model._layers.decoder.alpha.numpy(),
|
||||
'encoder_alpha':
|
||||
model._layers.encoder.alpha.numpy(),
|
||||
'decoder_alpha':
|
||||
model._layers.decoder.alpha.numpy(),
|
||||
}, global_step)
|
||||
else:
|
||||
writer.add_scalars('alphas', {
|
||||
'encoder_alpha':model.encoder.alpha.numpy(),
|
||||
'decoder_alpha':model.decoder.alpha.numpy(),
|
||||
'encoder_alpha': model.encoder.alpha.numpy(),
|
||||
'decoder_alpha': model.decoder.alpha.numpy(),
|
||||
}, global_step)
|
||||
|
||||
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
|
||||
writer.add_scalar('learning_rate',
|
||||
optimizer._learning_rate.step().numpy(),
|
||||
global_step)
|
||||
|
||||
if global_step % args.image_step == 1:
|
||||
for i, prob in enumerate(attn_probs):
|
||||
for j in range(4):
|
||||
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
|
||||
writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
|
||||
x = np.uint8(
|
||||
cm.viridis(prob.numpy()[j * 16]) * 255)
|
||||
writer.add_image(
|
||||
'Attention_%d_0' % global_step,
|
||||
x,
|
||||
i * 4 + j,
|
||||
dataformats="HWC")
|
||||
|
||||
for i, prob in enumerate(attn_enc):
|
||||
for j in range(4):
|
||||
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
|
||||
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
|
||||
x = np.uint8(
|
||||
cm.viridis(prob.numpy()[j * 16]) * 255)
|
||||
writer.add_image(
|
||||
'Attention_enc_%d_0' % global_step,
|
||||
x,
|
||||
i * 4 + j,
|
||||
dataformats="HWC")
|
||||
|
||||
for i, prob in enumerate(attn_dec):
|
||||
for j in range(4):
|
||||
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
|
||||
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
|
||||
|
||||
x = np.uint8(
|
||||
cm.viridis(prob.numpy()[j * 16]) * 255)
|
||||
writer.add_image(
|
||||
'Attention_dec_%d_0' % global_step,
|
||||
x,
|
||||
i * 4 + j,
|
||||
dataformats="HWC")
|
||||
|
||||
if args.use_data_parallel:
|
||||
loss = model.scale_loss(loss)
|
||||
loss.backward()
|
||||
model.apply_collective_grads()
|
||||
else:
|
||||
loss.backward()
|
||||
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
|
||||
optimizer.minimize(
|
||||
loss,
|
||||
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
|
||||
'grad_clip_thresh']))
|
||||
model.clear_gradients()
|
||||
|
||||
|
||||
# save checkpoint
|
||||
if local_rank==0 and global_step % args.save_step == 0:
|
||||
if local_rank == 0 and global_step % args.save_step == 0:
|
||||
if not os.path.exists(args.save_path):
|
||||
os.mkdir(args.save_path)
|
||||
save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
|
||||
save_path = os.path.join(args.save_path,
|
||||
'transformer/%d' % global_step)
|
||||
dg.save_dygraph(model.state_dict(), save_path)
|
||||
dg.save_dygraph(optimizer.state_dict(), save_path)
|
||||
if local_rank==0:
|
||||
if local_rank == 0:
|
||||
writer.close()
|
||||
|
||||
|
||||
if __name__ =='__main__':
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Train TransformerTTS model")
|
||||
add_config_options_to_parser(parser)
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from tensorboardX import SummaryWriter
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
|
|||
from data import LJSpeechLoader
|
||||
from parakeet.models.transformer_tts.vocoder import Vocoder
|
||||
|
||||
|
||||
def load_checkpoint(step, model_path):
|
||||
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
|
||||
new_state_dict = OrderedDict()
|
||||
|
@ -23,8 +37,9 @@ def load_checkpoint(step, model_path):
|
|||
new_state_dict[param] = model_dict[param]
|
||||
return new_state_dict, opti_dict
|
||||
|
||||
|
||||
def main(args):
|
||||
|
||||
|
||||
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
|
||||
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
|
||||
|
||||
|
@ -35,23 +50,26 @@ def main(args):
|
|||
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
|
||||
if args.use_data_parallel else fluid.CUDAPlace(0)
|
||||
if args.use_gpu else fluid.CPUPlace())
|
||||
|
||||
|
||||
if not os.path.exists(args.log_dir):
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir,'vocoder')
|
||||
os.mkdir(args.log_dir)
|
||||
path = os.path.join(args.log_dir, 'vocoder')
|
||||
|
||||
writer = SummaryWriter(path) if local_rank == 0 else None
|
||||
|
||||
with dg.guard(place):
|
||||
with dg.guard(place):
|
||||
model = Vocoder(cfg, args.batch_size)
|
||||
|
||||
model.train()
|
||||
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
|
||||
parameter_list=model.parameters())
|
||||
|
||||
optimizer = fluid.optimizer.AdamOptimizer(
|
||||
learning_rate=dg.NoamDecay(1 / (
|
||||
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
|
||||
parameter_list=model.parameters())
|
||||
|
||||
if args.checkpoint_path is not None:
|
||||
model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))
|
||||
model_dict, opti_dict = load_checkpoint(
|
||||
str(args.vocoder_step),
|
||||
os.path.join(args.checkpoint_path, "vocoder"))
|
||||
model.set_dict(model_dict)
|
||||
optimizer.set_dict(opti_dict)
|
||||
global_step = args.vocoder_step
|
||||
|
@ -61,48 +79,55 @@ def main(args):
|
|||
strategy = dg.parallel.prepare_context()
|
||||
model = fluid.dygraph.parallel.DataParallel(model, strategy)
|
||||
|
||||
reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
|
||||
reader = LJSpeechLoader(
|
||||
cfg, args, nranks, local_rank, is_vocoder=True).reader()
|
||||
|
||||
for epoch in range(args.epochs):
|
||||
pbar = tqdm(reader)
|
||||
for i, data in enumerate(pbar):
|
||||
pbar.set_description('Processing at epoch %d'%epoch)
|
||||
pbar.set_description('Processing at epoch %d' % epoch)
|
||||
mel, mag = data
|
||||
mag = dg.to_variable(mag.numpy())
|
||||
mel = dg.to_variable(mel.numpy())
|
||||
global_step += 1
|
||||
|
||||
mag_pred = model(mel)
|
||||
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
|
||||
|
||||
loss = layers.mean(
|
||||
layers.abs(layers.elementwise_sub(mag_pred, mag)))
|
||||
|
||||
if args.use_data_parallel:
|
||||
loss = model.scale_loss(loss)
|
||||
loss.backward()
|
||||
model.apply_collective_grads()
|
||||
else:
|
||||
loss.backward()
|
||||
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
|
||||
optimizer.minimize(
|
||||
loss,
|
||||
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
|
||||
'grad_clip_thresh']))
|
||||
model.clear_gradients()
|
||||
|
||||
if local_rank==0:
|
||||
writer.add_scalars('training_loss',{
|
||||
'loss':loss.numpy(),
|
||||
|
||||
if local_rank == 0:
|
||||
writer.add_scalars('training_loss', {
|
||||
'loss': loss.numpy(),
|
||||
}, global_step)
|
||||
|
||||
if global_step % args.save_step == 0:
|
||||
if not os.path.exists(args.save_path):
|
||||
os.mkdir(args.save_path)
|
||||
save_path = os.path.join(args.save_path,'vocoder/%d' % global_step)
|
||||
save_path = os.path.join(args.save_path,
|
||||
'vocoder/%d' % global_step)
|
||||
dg.save_dygraph(model.state_dict(), save_path)
|
||||
dg.save_dygraph(optimizer.state_dict(), save_path)
|
||||
|
||||
if local_rank==0:
|
||||
if local_rank == 0:
|
||||
writer.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: build the CLI parser, echo the parsed settings and
    # start vocoder training exactly once.
    parser = argparse.ArgumentParser(description="Train vocoder model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(args)
    main(args)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
from pprint import pprint
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
from pprint import pprint
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import time
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
__version__ = "0.0.0"
|
||||
|
||||
from . import data, g2p, models, modules
|
||||
|
|
|
@ -1 +1,15 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .audio import AudioProcessor
|
|
@ -1,30 +1,46 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
import scipy.io
|
||||
import scipy.signal
|
||||
|
||||
|
||||
class AudioProcessor(object):
|
||||
def __init__(self,
|
||||
sample_rate=None, # int, sampling rate
|
||||
num_mels=None, # int, bands of mel spectrogram
|
||||
min_level_db=None, # float, minimum level db
|
||||
ref_level_db=None, # float, reference level db
|
||||
n_fft=None, # int: number of samples in a frame for stft
|
||||
win_length=None, # int: the same meaning with n_fft
|
||||
hop_length=None, # int: number of samples between neighboring frame
|
||||
power=None, # float:power to raise before griffin-lim
|
||||
preemphasis=None, # float: preemphasis coefficident
|
||||
signal_norm=None, #
|
||||
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
|
||||
max_norm=None, # float, max norm
|
||||
mel_fmin=None, # int: mel spectrogram's minimum frequency
|
||||
mel_fmax=None, # int: mel spectrogram's maximum frequency
|
||||
clip_norm=True, # bool: clip spectrogram's norm
|
||||
griffin_lim_iters=None, # int:
|
||||
do_trim_silence=False, # bool: trim silence
|
||||
sound_norm=False,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
sample_rate=None, # int, sampling rate
|
||||
num_mels=None, # int, bands of mel spectrogram
|
||||
min_level_db=None, # float, minimum level db
|
||||
ref_level_db=None, # float, reference level db
|
||||
n_fft=None, # int: number of samples in a frame for stft
|
||||
win_length=None, # int: the same meaning with n_fft
|
||||
hop_length=None, # int: number of samples between neighboring frame
|
||||
power=None, # float:power to raise before griffin-lim
|
||||
preemphasis=None, # float: preemphasis coefficident
|
||||
signal_norm=None, #
|
||||
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
|
||||
max_norm=None, # float, max norm
|
||||
mel_fmin=None, # int: mel spectrogram's minimum frequency
|
||||
mel_fmax=None, # int: mel spectrogram's maximum frequency
|
||||
clip_norm=True, # bool: clip spectrogram's norm
|
||||
griffin_lim_iters=None, # int:
|
||||
do_trim_silence=False, # bool: trim silence
|
||||
sound_norm=False,
|
||||
**kwargs):
|
||||
self.sample_rate = sample_rate
|
||||
self.num_mels = num_mels
|
||||
self.min_level_db = min_level_db
|
||||
|
@ -34,8 +50,8 @@ class AudioProcessor(object):
|
|||
self.n_fft = n_fft
|
||||
self.win_length = win_length or n_fft
|
||||
# hop length defaults to 1/4 window_length
|
||||
self.hop_length = hop_length or 0.25 * self.win_length
|
||||
|
||||
self.hop_length = hop_length or 0.25 * self.win_length
|
||||
|
||||
self.power = power
|
||||
self.preemphasis = float(preemphasis)
|
||||
|
||||
|
@ -52,7 +68,8 @@ class AudioProcessor(object):
|
|||
self.do_trim_silence = do_trim_silence
|
||||
|
||||
self.sound_norm = sound_norm
|
||||
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
|
||||
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
|
||||
)
|
||||
|
||||
def _stft_parameters(self):
|
||||
"""compute frame length and hop length in ms"""
|
||||
|
@ -65,44 +82,54 @@ class AudioProcessor(object):
|
|||
"""object repr"""
|
||||
cls_name_str = self.__class__.__name__
|
||||
members = vars(self)
|
||||
dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()])
|
||||
dict_str = "\n".join(
|
||||
[" {}: {},".format(k, v) for k, v in members.items()])
|
||||
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
|
||||
return repr_str
|
||||
|
||||
def save_wav(self, path, wav):
    """Save audio with scipy.io.wavfile as 16-bit integers.

    Args:
        path: destination handed unchanged to ``scipy.io.wavfile.write``.
        wav: numpy array of samples; it is rescaled so that its peak
            absolute value maps to 32767 before the int16 conversion.
    """
    # The 0.01 floor keeps the scale factor finite for (near-)silent audio.
    wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
    # ndarray has no ``as_type`` method; the conversion method is ``astype``.
    scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
|
||||
|
||||
def load_wav(self, path, sr=None):
    """Load a wav file: load -> trim_silence -> rescale.

    Args:
        path: audio file path handed to ``librosa.load``.
        sr: kept for interface compatibility. NOTE(review): the value is
            not forwarded; the file is always read with ``sr=None`` and
            this name is rebound to the rate librosa reports.

    Returns:
        The loaded samples, silence-trimmed when ``self.do_trim_silence``
        is set and rescaled when ``self.sound_norm`` is set.

    Raises:
        AssertionError: if the file's sample rate differs from
            ``self.sample_rate``.
    """
    x, sr = librosa.load(path, sr=None)
    assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
        sr, self.sample_rate)
    if self.do_trim_silence:
        try:
            x = self.trim_silence(x)
        except ValueError:
            # Best effort: keep the untrimmed audio and report the file.
            print(" [!] File cannot be trimmed for silence - {}".format(
                path))
    if self.sound_norm:
        x = x / x.max() * 0.9  # why 0.9 ?
    return x
|
||||
|
||||
def trim_silence(self, wav):
    """Trim silent parts with a threshold and 0.01s margin.

    Args:
        wav: array of audio samples.

    Returns:
        The first element of ``librosa.effects.trim``'s result, computed
        on ``wav`` with ``margin`` samples removed from each end.
    """
    margin = int(self.sample_rate * 0.01)
    # Use an explicit end index: ``wav[margin:-margin]`` would produce an
    # empty array when margin is 0 (sample rates below 100 Hz).
    wav = wav[margin:len(wav) - margin]
    trimmed_wav = librosa.effects.trim(
        wav,
        top_db=60,
        frame_length=self.win_length,
        hop_length=self.hop_length)[0]
    return trimmed_wav
|
||||
|
||||
def apply_preemphasis(self, x):
    """Filter ``x`` with the FIR filter ``y[n] = x[n] - preemphasis * x[n-1]``.

    Args:
        x: input signal passed to ``scipy.signal.lfilter``.

    Returns:
        The filtered signal.

    Raises:
        RuntimeError: if ``self.preemphasis`` equals 0.
    """
    if self.preemphasis == 0.:
        raise RuntimeError(
            " !! Preemphasis coefficient should be positive. ")
    return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
|
||||
|
||||
def apply_inv_preemphasis(self, x):
    """Undo pre-emphasis with the IIR filter ``y[n] = x[n] + preemphasis * y[n-1]``.

    Args:
        x: input signal passed to ``scipy.signal.lfilter``.

    Returns:
        The filtered signal.

    Raises:
        RuntimeError: if ``self.preemphasis`` equals 0.
    """
    if self.preemphasis == 0.:
        raise RuntimeError(
            " !! Preemphasis coefficient should be positive. ")
    return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
|
||||
|
||||
def _amplitude_to_db(self, x):
|
||||
|
@ -125,12 +152,11 @@ class AudioProcessor(object):
|
|||
"""return mel basis for mel scale"""
|
||||
if self.mel_fmax is not None:
|
||||
assert self.mel_fmax <= self.sample_rate // 2
|
||||
return librosa.filters.mel(
|
||||
self.sample_rate,
|
||||
self.n_fft,
|
||||
n_mels=self.num_mels,
|
||||
fmin=self.mel_fmin,
|
||||
fmax=self.mel_fmax)
|
||||
return librosa.filters.mel(self.sample_rate,
|
||||
self.n_fft,
|
||||
n_mels=self.num_mels,
|
||||
fmin=self.mel_fmin,
|
||||
fmax=self.mel_fmax)
|
||||
|
||||
def _normalize(self, S):
|
||||
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
|
||||
|
@ -156,25 +182,29 @@ class AudioProcessor(object):
|
|||
if self.symmetric_norm:
|
||||
if self.clip_norm:
|
||||
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
|
||||
S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
|
||||
S_denorm = (S_denorm + self.max_norm) * (
|
||||
-self.min_level_db) / (2 * self.max_norm
|
||||
) + self.min_level_db
|
||||
return S_denorm
|
||||
else:
|
||||
if self.clip_norm:
|
||||
S_denorm = np.clip(S_denorm, 0, self.max_norm)
|
||||
S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
|
||||
S_denorm = S_denorm * (-self.min_level_db
|
||||
) / self.max_norm + self.min_level_db
|
||||
return S_denorm
|
||||
else:
|
||||
return S
|
||||
|
||||
def _stft(self, y):
    """Return ``librosa.stft`` of ``y`` using this processor's frame settings.

    The ``y`` keyword is passed exactly once; repeating a keyword argument
    in a call is a syntax error.
    """
    return librosa.stft(
        y=y,
        n_fft=self.n_fft,
        win_length=self.win_length,
        hop_length=self.hop_length)
|
||||
|
||||
def _istft(self, S):
    """Return ``librosa.istft`` of ``S`` using this processor's hop and window length."""
    return librosa.istft(
        S, hop_length=self.hop_length, win_length=self.win_length)
|
||||
|
||||
def spectrogram(self, y):
|
||||
"""compute linear spectrogram(amplitude)
|
||||
|
@ -195,7 +225,8 @@ class AudioProcessor(object):
|
|||
D = self._stft(self.apply_preemphasis(y))
|
||||
else:
|
||||
D = self._stft(y)
|
||||
S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
|
||||
S = self._amplitude_to_db(self._linear_to_mel(np.abs(
|
||||
D))) - self.ref_level_db
|
||||
return self._normalize(S)
|
||||
|
||||
def inv_spectrogram(self, spectrogram):
|
||||
|
@ -203,16 +234,16 @@ class AudioProcessor(object):
|
|||
S = self._denormalize(spectrogram)
|
||||
S = self._db_to_amplitude(S + self.ref_level_db)
|
||||
if self.preemphasis:
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
|
||||
return self._griffin_lim(S ** self.power)
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
|
||||
return self._griffin_lim(S**self.power)
|
||||
|
||||
def inv_melspectrogram(self, mel_spectrogram):
    """Reconstruct a waveform from a normalized mel spectrogram.

    Steps, in order: ``_denormalize`` -> add ``ref_level_db`` and
    ``_db_to_amplitude`` -> ``_mel_to_linear`` on the absolute value ->
    ``_griffin_lim`` on the result raised to ``self.power``. When
    ``self.preemphasis`` is truthy the waveform is additionally passed
    through ``apply_inv_preemphasis``.
    """
    S = self._denormalize(mel_spectrogram)
    S = self._db_to_amplitude(S + self.ref_level_db)
    S = self._mel_to_linear(np.abs(S))
    wav = self._griffin_lim(S**self.power)
    if self.preemphasis:
        return self.apply_inv_preemphasis(wav)
    return wav
|
||||
|
||||
def out_linear_to_mel(self, linear_spec):
|
||||
"""convert output linear spec to mel spec"""
|
||||
|
@ -222,7 +253,7 @@ class AudioProcessor(object):
|
|||
S = self._amplitude_to_db(S) - self.ref_level_db
|
||||
mel = self._normalize(S)
|
||||
return mel
|
||||
|
||||
|
||||
def _griffin_lim(self, S):
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
|
@ -234,18 +265,18 @@ class AudioProcessor(object):
|
|||
|
||||
@staticmethod
|
||||
def mulaw_encode(wav, qc):
    """Compress ``wav`` with mu-law companding and quantize it.

    Args:
        wav: array of samples. NOTE(review): the formula maps [-1, 1]
            onto [0, mu]; inputs outside that range are not clipped here.
        qc: number of bits; ``mu`` is ``2**qc - 1``.

    Returns:
        Float array holding the floored quantization levels.
    """
    mu = 2**qc - 1
    signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
    # Quantize signal to the specified number of levels.
    signal = (signal + 1) / 2 * mu + 0.5
    return np.floor(signal)
|
||||
|
||||
@staticmethod
|
||||
def mulaw_decode(wav, qc):
    """Recovers waveform from quantized values.

    Applies the mu-law expansion ``sign(w) / mu * ((1 + mu)**|w| - 1)``
    with ``mu = 2**qc - 1``. NOTE(review): the formula expects ``wav``
    already rescaled to [-1, 1]; integer levels are not rescaled here.
    """
    mu = 2**qc - 1
    x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
    return x
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .dataset import *
|
||||
from .datacargo import *
|
||||
from .sampler import *
|
||||
|
|
|
@ -1,18 +1,34 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
functions to make batch for arrays which satisfy some conditions.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TextIDBatcher(object):
    """Functor around ``batch_text_id`` that remembers the padding id and dtype."""

    def __init__(self, pad_id=0, dtype=np.int64):
        # Stored configuration, forwarded on every call.
        self.dtype = dtype
        self.pad_id = pad_id

    def __call__(self, minibatch):
        """Batch ``minibatch`` with the stored ``pad_id`` and ``dtype``."""
        return batch_text_id(minibatch, dtype=self.dtype, pad_id=self.pad_id)
|
||||
|
||||
|
||||
def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
|
||||
"""
|
||||
minibatch: List[Example]
|
||||
|
@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
|
|||
"""
|
||||
peek_example = minibatch[0]
|
||||
assert len(peek_example.shape) == 1, "text example is an 1D tensor"
|
||||
|
||||
lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
|
||||
|
||||
lengths = [example.shape[0] for example in minibatch
|
||||
] # assume (channel, n_samples) or (n_samples, )
|
||||
max_len = np.max(lengths)
|
||||
|
||||
|
||||
batch = []
|
||||
for example in minibatch:
|
||||
pad_len = max_len - example.shape[0]
|
||||
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id))
|
||||
batch.append(
|
||||
np.pad(example, [(0, pad_len)],
|
||||
mode='constant',
|
||||
constant_values=pad_id))
|
||||
|
||||
return np.array(batch, dtype=dtype)
|
||||
|
||||
|
||||
class WavBatcher(object):
    """Functor around ``batch_wav`` that remembers the padding value and dtype."""

    def __init__(self, pad_value=0., dtype=np.float32):
        # Stored configuration, forwarded on every call.
        self.dtype = dtype
        self.pad_value = pad_value

    def __call__(self, minibatch):
        """Batch ``minibatch`` with the stored ``pad_value`` and ``dtype``."""
        return batch_wav(minibatch, dtype=self.dtype, pad_value=self.pad_value)
|
||||
|
||||
|
||||
def batch_wav(minibatch, pad_value=0., dtype=np.float32):
|
||||
"""
|
||||
minibatch: List[Example]
|
||||
|
@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
|
|||
mono_channel = True
|
||||
elif len(peek_example.shape) == 2:
|
||||
mono_channel = False
|
||||
|
||||
lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
|
||||
|
||||
lengths = [example.shape[-1] for example in minibatch
|
||||
] # assume (channel, n_samples) or (n_samples, )
|
||||
max_len = np.max(lengths)
|
||||
|
||||
|
||||
batch = []
|
||||
for example in minibatch:
|
||||
pad_len = max_len - example.shape[-1]
|
||||
if mono_channel:
|
||||
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value))
|
||||
batch.append(
|
||||
np.pad(example, [(0, pad_len)],
|
||||
mode='constant',
|
||||
constant_values=pad_value))
|
||||
else:
|
||||
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
|
||||
|
||||
batch.append(
|
||||
np.pad(example, [(0, 0), (0, pad_len)],
|
||||
mode='constant',
|
||||
constant_values=pad_value)) # what about PCM, no
|
||||
|
||||
return np.array(batch, dtype=dtype)
|
||||
|
||||
|
||||
|
@ -75,6 +104,7 @@ class SpecBatcher(object):
|
|||
out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
|
||||
return out
|
||||
|
||||
|
||||
def batch_spec(minibatch, pad_value=0., dtype=np.float32):
|
||||
"""
|
||||
minibatch: List[Example]
|
||||
|
@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
|
|||
mono_channel = True
|
||||
elif len(peek_example.shape) == 3:
|
||||
mono_channel = False
|
||||
|
||||
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame)
|
||||
max_len = np.max(lengths)
|
||||
|
||||
|
||||
lengths = [example.shape[-1] for example in minibatch
|
||||
] # assume (channel, F, n_frame) or (F, n_frame)
|
||||
max_len = np.max(lengths)
|
||||
|
||||
batch = []
|
||||
for example in minibatch:
|
||||
pad_len = max_len - example.shape[-1]
|
||||
if mono_channel:
|
||||
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value))
|
||||
batch.append(
|
||||
np.pad(example, [(0, 0), (0, pad_len)],
|
||||
mode='constant',
|
||||
constant_values=pad_value))
|
||||
else:
|
||||
batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
|
||||
|
||||
return np.array(batch, dtype=dtype)
|
||||
batch.append(
|
||||
np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
|
||||
mode='constant',
|
||||
constant_values=pad_value)) # what about PCM, no
|
||||
|
||||
return np.array(batch, dtype=dtype)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import six
|
||||
from .sampler import SequentialSampler, RandomSampler, BatchSampler
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import six
|
||||
import numpy as np
|
||||
|
||||
|
@ -9,8 +23,7 @@ class DatasetMixin(object):
|
|||
if isinstance(index, slice):
|
||||
start, stop, step = index.indices(len(self))
|
||||
return [
|
||||
self.get_example(i)
|
||||
for i in six.moves.range(start, stop, step)
|
||||
self.get_example(i) for i in six.moves.range(start, stop, step)
|
||||
]
|
||||
elif isinstance(index, (list, np.ndarray)):
|
||||
return [self.get_example(i) for i in index]
|
||||
|
@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
|
|||
|
||||
def get_example(self, i):
|
||||
if i < 0:
|
||||
raise IndexError(
|
||||
"ChainDataset doesnot support negative indexing.")
|
||||
raise IndexError("ChainDataset doesnot support negative indexing.")
|
||||
|
||||
for dataset in self._datasets:
|
||||
if i < len(dataset):
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__.
|
||||
|
||||
|
@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
|
|||
So the sampler is only responsible for generating valid indices.
|
||||
"""
|
||||
|
||||
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
|
||||
class Sampler(object):
|
||||
def __init__(self, data_source):
|
||||
pass
|
||||
|
@ -23,7 +36,7 @@ class Sampler(object):
|
|||
class SequentialSampler(Sampler):
|
||||
def __init__(self, data_source):
|
||||
self.data_source = data_source
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
return iter(range(len(self.data_source)))
|
||||
|
||||
|
@ -42,12 +55,14 @@ class RandomSampler(Sampler):
|
|||
"replacement={}".format(self.replacement))
|
||||
|
||||
if self._num_samples is not None and not replacement:
|
||||
raise ValueError("With replacement=False, num_samples should not be specified, "
|
||||
"since a random permutation will be performed.")
|
||||
raise ValueError(
|
||||
"With replacement=False, num_samples should not be specified, "
|
||||
"since a random permutation will be performed.")
|
||||
|
||||
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
|
||||
raise ValueError("num_samples should be a positive integer "
|
||||
"value, but got num_samples={}".format(self.num_samples))
|
||||
"value, but got num_samples={}".format(
|
||||
self.num_samples))
|
||||
|
||||
@property
|
||||
def num_samples(self):
|
||||
|
@ -59,7 +74,9 @@ class RandomSampler(Sampler):
|
|||
def __iter__(self):
    """Yield sample indices for ``self.data_source``.

    With replacement, ``self.num_samples`` indices are drawn uniformly
    from ``[0, n)``; otherwise a random permutation of ``range(n)`` is
    returned, where ``n = len(self.data_source)``.
    """
    n = len(self.data_source)
    if self.replacement:
        return iter(
            np.random.randint(
                0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
    return iter(np.random.permutation(n).tolist())
|
||||
|
||||
def __len__(self):
|
||||
|
@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
|
|||
self.indices = indices
|
||||
|
||||
def __iter__(self):
    """Lazily yield the entries of ``self.indices`` in a random order."""
    return (self.indices[i]
            for i in np.random.permutation(len(self.indices)))
|
||||
|
||||
def __len__(self):
    """Number of indices this sampler draws from."""
    count = len(self.indices)
    return count
|
||||
|
@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
|
|||
3. Permutate mini-batchs
|
||||
"""
|
||||
|
||||
def __init__(self, lengths, batch_size=4, batch_group_size=None,
|
||||
def __init__(self,
|
||||
lengths,
|
||||
batch_size=4,
|
||||
batch_group_size=None,
|
||||
permutate=True):
|
||||
_lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key
|
||||
_lengths = np.array(
|
||||
lengths,
|
||||
dtype=np.int64) # maybe better implement length as a sort key
|
||||
self.lengths = np.sort(_lengths)
|
||||
self.sorted_indices = np.argsort(_lengths)
|
||||
|
||||
|
@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
|
|||
for i in range(len(indices) // batch_group_size):
|
||||
s = i * batch_group_size
|
||||
e = s + batch_group_size
|
||||
random.shuffle(indices[s: e]) # inplace
|
||||
random.shuffle(indices[s:e]) # inplace
|
||||
|
||||
# Permutate batches
|
||||
if self.permutate:
|
||||
perm = np.arange(len(indices[:e]) // self.batch_size)
|
||||
random.shuffle(perm)
|
||||
indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1)
|
||||
indices[:e] = indices[:e].reshape(
|
||||
-1, self.batch_size)[perm, :].reshape(-1)
|
||||
|
||||
# Handle last elements
|
||||
s += batch_group_size
|
||||
#print(indices)
|
||||
if s < len(indices):
|
||||
random.shuffle(indices[s:])
|
||||
|
||||
|
||||
return iter(indices)
|
||||
|
||||
def __len__(self):
|
||||
|
@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
|
|||
def __init__(self, weights, num_samples, replacement):
|
||||
if not isinstance(num_samples, int) or num_samples <= 0:
|
||||
raise ValueError("num_samples should be a positive integer "
|
||||
"value, but got num_samples={}".format(num_samples))
|
||||
"value, but got num_samples={}".format(
|
||||
num_samples))
|
||||
self.weights = np.array(weights, dtype=np.float64)
|
||||
self.num_samples = num_samples
|
||||
self.replacement = replacement
|
||||
|
||||
def __iter__(self):
|
||||
return iter(np.random.choice(len(self.weights), size=(self.num_samples, ),
|
||||
replace=self.replacement, p=self.weights).tolist())
|
||||
return iter(
|
||||
np.random.choice(
|
||||
len(self.weights),
|
||||
size=(self.num_samples, ),
|
||||
replace=self.replacement,
|
||||
p=self.weights).tolist())
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples
|
||||
|
@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
|
|||
|
||||
# Subset samples for each trainer.
|
||||
indices = indices[self.rank:self.total_size:self.num_trainers]
|
||||
assert len(indices) == self.num_samples
|
||||
assert len(indices) == self.num_samples
|
||||
|
||||
return iter(indices)
|
||||
|
||||
|
@ -209,8 +238,7 @@ class BatchSampler(Sampler):
|
|||
def __init__(self, sampler, batch_size, drop_last):
|
||||
if not isinstance(sampler, Sampler):
|
||||
raise ValueError("sampler should be an instance of "
|
||||
"Sampler, but got sampler={}"
|
||||
.format(sampler))
|
||||
"Sampler, but got sampler={}".format(sampler))
|
||||
if not isinstance(batch_size, int) or batch_size <= 0:
|
||||
raise ValueError("batch_size should be a positive integer value, "
|
||||
"but got batch_size={}".format(batch_size))
|
||||
|
|
|
@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand
|
|||
|
||||
In deep learning practice, we typically batch examples, so the dataset should come with a method to batch examples. Assume the record is implemented as a tuple with several items. When an item is represented as a fixed-size array, batching is trivial: `np.stack` suffices. But for arrays with dynamic size, padding is needed. We decided to implement a batching method for each item; batching a record can then be implemented with these methods. For a dataset, a `_batch_examples` method should be implemented, but in most cases you can choose one from `batching.py`.
|
||||
|
||||
That is it!
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
That is it!
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from ruamel.yaml import YAML
|
||||
|
@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset
|
|||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.batch import TextIDBatcher, WavBatcher
|
||||
|
||||
|
||||
class VCTK(Dataset):
|
||||
def __init__(self, root):
|
||||
assert isinstance(root, (str, Path)), "root should be a string or Path object"
|
||||
assert isinstance(root, (
|
||||
str, Path)), "root should be a string or Path object"
|
||||
self.root = root if isinstance(root, Path) else Path(root)
|
||||
self.text_root = self.root.joinpath("txt")
|
||||
self.wav_root = self.root.joinpath("wav48")
|
||||
|
||||
if not (self.root.joinpath("metadata.csv").exists() and
|
||||
if not (self.root.joinpath("metadata.csv").exists() and
|
||||
self.root.joinpath("speaker_indices.yaml").exists()):
|
||||
self._prepare_metadata()
|
||||
self.speaker_indices, self.metadata = self._load_metadata()
|
||||
|
||||
def _load_metadata(self):
|
||||
yaml=YAML(typ='safe')
|
||||
yaml = YAML(typ='safe')
|
||||
speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
|
||||
metadata = pd.read_csv(self.root.joinpath("metadata.csv"),
|
||||
sep="|", quoting=3, header=1)
|
||||
metadata = pd.read_csv(
|
||||
self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
|
||||
return speaker_indices, metadata
|
||||
|
||||
def _prepare_metadata(self):
|
||||
|
@ -41,15 +57,19 @@ class VCTK(Dataset):
|
|||
with io.open(str(text_file)) as f:
|
||||
transcription = f.read().strip()
|
||||
wav_file = text_file.with_suffix(".wav")
|
||||
metadata.append((wav_file.name, speaker_folder.name, transcription))
|
||||
metadata = pd.DataFrame.from_records(metadata,
|
||||
columns=["wave_file", "speaker", "text"])
|
||||
|
||||
metadata.append(
|
||||
(wav_file.name, speaker_folder.name, transcription))
|
||||
metadata = pd.DataFrame.from_records(
|
||||
metadata, columns=["wave_file", "speaker", "text"])
|
||||
|
||||
# save them
|
||||
yaml=YAML(typ='safe')
|
||||
yaml = YAML(typ='safe')
|
||||
yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
|
||||
metadata.to_csv(self.root.joinpath("metadata.csv"),
|
||||
sep="|", quoting=3, index=False)
|
||||
metadata.to_csv(
|
||||
self.root.joinpath("metadata.csv"),
|
||||
sep="|",
|
||||
quoting=3,
|
||||
index=False)
|
||||
|
||||
def _get_example(self, metadatum):
|
||||
wave_file, speaker, text = metadatum
|
||||
|
@ -77,5 +97,3 @@ class VCTK(Dataset):
|
|||
speaker_batch = np.array(speaker_batch)
|
||||
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
|
||||
return wav_batch, speaker_batch, phoneme_batch
|
||||
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
# coding: utf-8
|
||||
|
||||
"""Text processing frontend
|
||||
|
||||
All frontend module should have the following functions:
|
||||
|
|
|
@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
|
|||
from ..text import text_to_sequence
|
||||
text = text_to_sequence(text, ["english_cleaners"])
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
|
|||
from ..text import text_to_sequence
|
||||
text = text_to_sequence(text, ["basic_cleaners"])
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
# coding: utf-8
|
||||
|
||||
|
||||
import MeCab
|
||||
import jaconv
|
||||
from random import random
|
||||
|
@ -30,9 +29,9 @@ def _yomi(mecab_result):
|
|||
|
||||
|
||||
def _mix_pronunciation(tokens, yomis, p):
|
||||
return "".join(
|
||||
yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
|
||||
for idx in range(len(tokens)))
|
||||
return "".join(yomis[idx]
|
||||
if yomis[idx] is not None and random() < p else tokens[idx]
|
||||
for idx in range(len(tokens)))
|
||||
|
||||
|
||||
def mix_pronunciation(text, p):
|
||||
|
@ -59,8 +58,7 @@ def normalize_delimitor(text):
|
|||
|
||||
|
||||
def text_to_sequence(text, p=0.0):
|
||||
for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】",
|
||||
"(", ")", "(", ")"]:
|
||||
for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
|
||||
text = text.replace(c, "")
|
||||
text = text.replace("!", "!")
|
||||
text = text.replace("?", "?")
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
# coding: utf-8
|
||||
|
||||
|
||||
from random import random
|
||||
|
||||
n_vocab = 0xffff
|
||||
|
@ -13,5 +12,6 @@ _tagger = None
|
|||
def text_to_sequence(text, p=0.0):
|
||||
return [ord(c) for c in text] + [_eos] # EOS
|
||||
|
||||
|
||||
def sequence_to_text(seq):
|
||||
return "".join(chr(n) for n in seq)
|
||||
|
|
|
@ -1,8 +1,21 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
from . import cleaners
|
||||
from .symbols import symbols
|
||||
|
||||
|
||||
# Mappings from symbol to numeric ID and vice versa:
|
||||
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
||||
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
||||
|
@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
|
|||
if not m:
|
||||
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
|
||||
break
|
||||
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
|
||||
sequence += _symbols_to_sequence(
|
||||
_clean_text(m.group(1), cleaner_names))
|
||||
sequence += _arpabet_to_sequence(m.group(2))
|
||||
text = m.group(3)
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
'''
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
|
@ -14,31 +27,31 @@ import re
|
|||
from unidecode import unidecode
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r'\s+')
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
|
||||
('mrs', 'misess'),
|
||||
('mr', 'mister'),
|
||||
('dr', 'doctor'),
|
||||
('st', 'saint'),
|
||||
('co', 'company'),
|
||||
('jr', 'junior'),
|
||||
('maj', 'major'),
|
||||
('gen', 'general'),
|
||||
('drs', 'doctors'),
|
||||
('rev', 'reverend'),
|
||||
('lt', 'lieutenant'),
|
||||
('hon', 'honorable'),
|
||||
('sgt', 'sergeant'),
|
||||
('capt', 'captain'),
|
||||
('esq', 'esquire'),
|
||||
('ltd', 'limited'),
|
||||
('col', 'colonel'),
|
||||
('ft', 'fort'),
|
||||
]]
|
||||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
('mrs', 'misess'),
|
||||
('mr', 'mister'),
|
||||
('dr', 'doctor'),
|
||||
('st', 'saint'),
|
||||
('co', 'company'),
|
||||
('jr', 'junior'),
|
||||
('maj', 'major'),
|
||||
('gen', 'general'),
|
||||
('drs', 'doctors'),
|
||||
('rev', 'reverend'),
|
||||
('lt', 'lieutenant'),
|
||||
('hon', 'honorable'),
|
||||
('sgt', 'sergeant'),
|
||||
('capt', 'captain'),
|
||||
('esq', 'esquire'),
|
||||
('ltd', 'limited'),
|
||||
('col', 'colonel'),
|
||||
('ft', 'fort'),
|
||||
]]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
|
||||
|
|
|
@ -1,14 +1,28 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
|
||||
|
||||
valid_symbols = [
|
||||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
|
||||
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
|
||||
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
|
||||
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
|
||||
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
|
||||
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
|
||||
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
|
||||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
|
||||
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
|
||||
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
|
||||
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
|
||||
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
|
||||
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
|
||||
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
|
||||
'Y', 'Z', 'ZH'
|
||||
]
|
||||
|
||||
_valid_symbol_set = set(valid_symbols)
|
||||
|
@ -24,7 +38,10 @@ class CMUDict:
|
|||
else:
|
||||
entries = _parse_cmudict(file_or_path)
|
||||
if not keep_ambiguous:
|
||||
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
|
||||
entries = {
|
||||
word: pron
|
||||
for word, pron in entries.items() if len(pron) == 1
|
||||
}
|
||||
self._entries = entries
|
||||
|
||||
def __len__(self):
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
import inflect
|
||||
import re
|
||||
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||
|
@ -56,7 +55,8 @@ def _expand_number(m):
|
|||
elif num % 100 == 0:
|
||||
return _inflect.number_to_words(num // 100) + ' hundred'
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||||
return _inflect.number_to_words(
|
||||
num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='')
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
'''
|
||||
Defines the set of symbols used in text input to the model.
|
||||
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
|
||||
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
|
||||
from parakeet.models.deepvoice3.converter import Converter
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
from paddle import fluid
|
||||
|
@ -19,23 +33,19 @@ class Attention(dg.Layer):
|
|||
value_projection=True):
|
||||
super(Attention, self).__init__()
|
||||
std = np.sqrt(1 / query_dim)
|
||||
self.query_proj = Linear(query_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.query_proj = Linear(
|
||||
query_dim, embed_dim, param_attr=I.Normal(scale=std))
|
||||
if key_projection:
|
||||
std = np.sqrt(1 / embed_dim)
|
||||
self.key_proj = Linear(embed_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.key_proj = Linear(
|
||||
embed_dim, embed_dim, param_attr=I.Normal(scale=std))
|
||||
if value_projection:
|
||||
std = np.sqrt(1 / embed_dim)
|
||||
self.value_proj = Linear(embed_dim,
|
||||
embed_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.value_proj = Linear(
|
||||
embed_dim, embed_dim, param_attr=I.Normal(scale=std))
|
||||
std = np.sqrt(1 / embed_dim)
|
||||
self.out_proj = Linear(embed_dim,
|
||||
query_dim,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.out_proj = Linear(
|
||||
embed_dim, query_dim, param_attr=I.Normal(scale=std))
|
||||
|
||||
self.key_projection = key_projection
|
||||
self.value_projection = value_projection
|
||||
|
@ -102,9 +112,8 @@ class Attention(dg.Layer):
|
|||
|
||||
x = F.softmax(x)
|
||||
attn_scores = x
|
||||
x = F.dropout(x,
|
||||
self.dropout,
|
||||
dropout_implementation="upscale_in_train")
|
||||
x = F.dropout(
|
||||
x, self.dropout, dropout_implementation="upscale_in_train")
|
||||
x = F.matmul(x, values)
|
||||
encoder_length = keys.shape[1]
|
||||
# CAUTION: is it wrong? let it be now
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
|
||||
from paddle import fluid
|
||||
|
@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
|
|||
has residual connection from the input x, and scale the output by
|
||||
np.sqrt(0.5).
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
|
@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
|
|||
), "this block uses residual connection"\
|
||||
"the input_channes should equals num_filters"
|
||||
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
|
||||
self.conv = Conv1DCell(in_channels,
|
||||
2 * num_filters,
|
||||
filter_size,
|
||||
dilation,
|
||||
causal,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.conv = Conv1DCell(
|
||||
in_channels,
|
||||
2 * num_filters,
|
||||
filter_size,
|
||||
dilation,
|
||||
causal,
|
||||
param_attr=I.Normal(scale=std))
|
||||
|
||||
if n_speakers > 1:
|
||||
assert (speaker_dim is not None
|
||||
), "speaker embed should not be null in multi-speaker case"
|
||||
std = np.sqrt(1 / speaker_dim)
|
||||
self.fc = Linear(speaker_dim,
|
||||
num_filters,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.fc = Linear(
|
||||
speaker_dim, num_filters, param_attr=I.Normal(scale=std))
|
||||
|
||||
def forward(self, x, speaker_embed=None):
|
||||
"""
|
||||
|
@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
|
|||
C_out means the output channels of Conv1DGLU.
|
||||
"""
|
||||
residual = x
|
||||
x = F.dropout(x,
|
||||
self.dropout,
|
||||
dropout_implementation="upscale_in_train")
|
||||
x = F.dropout(
|
||||
x, self.dropout, dropout_implementation="upscale_in_train")
|
||||
x = self.conv(x)
|
||||
content, gate = F.split(x, num_or_sections=2, dim=1)
|
||||
|
||||
|
@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
|
|||
C_out means the output channels of Conv1DGLU.
|
||||
"""
|
||||
residual = x_t
|
||||
x_t = F.dropout(x_t,
|
||||
self.dropout,
|
||||
dropout_implementation="upscale_in_train")
|
||||
x_t = F.dropout(
|
||||
x_t, self.dropout, dropout_implementation="upscale_in_train")
|
||||
x_t = self.conv.add_input(x_t)
|
||||
content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
|
||||
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from itertools import chain
|
||||
|
||||
|
@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
|||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=1,
|
||||
std_mul=1.,
|
||||
dropout=dropout),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout),
|
||||
Conv1DTranspose(
|
||||
Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=1,
|
||||
std_mul=1.,
|
||||
dropout=dropout),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout)
|
||||
3,
|
||||
dilation=1,
|
||||
std_mul=1.,
|
||||
dropout=dropout), Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout), Conv1DTranspose(
|
||||
target_channels,
|
||||
target_channels,
|
||||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(scale=np.sqrt(
|
||||
4. / (2 * target_channels)))), Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=1,
|
||||
std_mul=1.,
|
||||
dropout=dropout), Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout)
|
||||
]
|
||||
return upsampling_convolutions
|
||||
|
||||
|
@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
|||
2,
|
||||
stride=2,
|
||||
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=1,
|
||||
std_mul=1.,
|
||||
dropout=dropout),
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout)
|
||||
Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=1,
|
||||
std_mul=1.,
|
||||
dropout=dropout), Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout)
|
||||
]
|
||||
return upsampling_convolutions
|
||||
|
||||
|
||||
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
|
||||
upsampling_convolutions = [
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout)
|
||||
Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
target_channels,
|
||||
target_channels,
|
||||
3,
|
||||
dilation=3,
|
||||
std_mul=4.,
|
||||
dropout=dropout)
|
||||
]
|
||||
return upsampling_convolutions
|
||||
|
||||
|
@ -108,6 +125,7 @@ class Converter(dg.Layer):
|
|||
Vocoder that transforms mel spectrogram (or ecoder hidden states)
|
||||
to waveform.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
|
@ -161,33 +179,36 @@ class Converter(dg.Layer):
|
|||
std = np.sqrt(std_mul / in_channels)
|
||||
# CAUTION: relu
|
||||
self.convolutions.append(
|
||||
Conv1D(in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
act="relu",
|
||||
param_attr=I.Normal(scale=std)))
|
||||
Conv1D(
|
||||
in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
act="relu",
|
||||
param_attr=I.Normal(scale=std)))
|
||||
in_channels = out_channels
|
||||
std_mul = 2.0
|
||||
self.convolutions.append(
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation=dilation,
|
||||
std_mul=std_mul,
|
||||
dropout=dropout))
|
||||
Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation=dilation,
|
||||
std_mul=std_mul,
|
||||
dropout=dropout))
|
||||
in_channels = out_channels
|
||||
std_mul = 4.0
|
||||
|
||||
# final conv proj, channel transformed to linear dim
|
||||
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
|
||||
# CAUTION: sigmoid
|
||||
self.last_conv_proj = Conv1D(in_channels,
|
||||
linear_dim,
|
||||
1,
|
||||
act="sigmoid",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.last_conv_proj = Conv1D(
|
||||
in_channels,
|
||||
linear_dim,
|
||||
1,
|
||||
act="sigmoid",
|
||||
param_attr=I.Normal(scale=std))
|
||||
|
||||
def forward(self, x, speaker_embed=None):
|
||||
"""
|
||||
|
@ -229,4 +250,4 @@ class Converter(dg.Layer):
|
|||
|
||||
out = self.last_conv_proj(x)
|
||||
out = F.transpose(out, [0, 2, 1])
|
||||
return out
|
||||
return out
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
import paddle.fluid.layers as F
|
||||
import paddle.fluid.initializer as I
|
||||
|
@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
|
|||
|
||||
class Decoder(dg.Layer):
|
||||
def __init__(
|
||||
self,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
embed_dim,
|
||||
mel_dim,
|
||||
r=1,
|
||||
max_positions=512,
|
||||
padding_idx=None, # remove it!
|
||||
preattention=(ConvSpec(128, 5, 1), ) * 4,
|
||||
convolutions=(ConvSpec(128, 5, 1), ) * 4,
|
||||
attention=True,
|
||||
dropout=0.0,
|
||||
use_memory_mask=False,
|
||||
force_monotonic_attention=False,
|
||||
query_position_rate=1.0,
|
||||
key_position_rate=1.0,
|
||||
window_range=WindowRange(-1, 3),
|
||||
key_projection=True,
|
||||
value_projection=True):
|
||||
self,
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
embed_dim,
|
||||
mel_dim,
|
||||
r=1,
|
||||
max_positions=512,
|
||||
padding_idx=None, # remove it!
|
||||
preattention=(ConvSpec(128, 5, 1), ) * 4,
|
||||
convolutions=(ConvSpec(128, 5, 1), ) * 4,
|
||||
attention=True,
|
||||
dropout=0.0,
|
||||
use_memory_mask=False,
|
||||
force_monotonic_attention=False,
|
||||
query_position_rate=1.0,
|
||||
key_position_rate=1.0,
|
||||
window_range=WindowRange(-1, 3),
|
||||
key_projection=True,
|
||||
value_projection=True):
|
||||
super(Decoder, self).__init__()
|
||||
|
||||
self.dropout = dropout
|
||||
|
@ -111,23 +125,17 @@ class Decoder(dg.Layer):
|
|||
|
||||
conv_channels = convolutions[0].out_channels
|
||||
# only when padding idx is 0 can we easily handle it
|
||||
self.embed_keys_positions = PositionEmbedding(max_positions,
|
||||
embed_dim,
|
||||
padding_idx=0)
|
||||
self.embed_query_positions = PositionEmbedding(max_positions,
|
||||
conv_channels,
|
||||
padding_idx=0)
|
||||
self.embed_keys_positions = PositionEmbedding(
|
||||
max_positions, embed_dim, padding_idx=0)
|
||||
self.embed_query_positions = PositionEmbedding(
|
||||
max_positions, conv_channels, padding_idx=0)
|
||||
|
||||
if n_speakers > 1:
|
||||
std = np.sqrt((1 - dropout) / speaker_dim)
|
||||
self.speaker_proj1 = Linear(speaker_dim,
|
||||
1,
|
||||
act="sigmoid",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.speaker_proj2 = Linear(speaker_dim,
|
||||
1,
|
||||
act="sigmoid",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.speaker_proj1 = Linear(
|
||||
speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
|
||||
self.speaker_proj2 = Linear(
|
||||
speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
|
||||
|
||||
# prenet
|
||||
self.prenet = dg.LayerList()
|
||||
|
@ -138,24 +146,26 @@ class Decoder(dg.Layer):
|
|||
# conv1d & relu
|
||||
std = np.sqrt(std_mul / in_channels)
|
||||
self.prenet.append(
|
||||
Conv1D(in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
act="relu",
|
||||
param_attr=I.Normal(scale=std)))
|
||||
Conv1D(
|
||||
in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
act="relu",
|
||||
param_attr=I.Normal(scale=std)))
|
||||
in_channels = out_channels
|
||||
std_mul = 2.0
|
||||
self.prenet.append(
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation,
|
||||
std_mul,
|
||||
dropout,
|
||||
causal=True,
|
||||
residual=True))
|
||||
Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation,
|
||||
std_mul,
|
||||
dropout,
|
||||
causal=True,
|
||||
residual=True))
|
||||
in_channels = out_channels
|
||||
std_mul = 4.0
|
||||
|
||||
|
@ -184,16 +194,17 @@ class Decoder(dg.Layer):
|
|||
assert (
|
||||
in_channels == out_channels
|
||||
), "the stack of convolution & attention does not change channels"
|
||||
conv_layer = Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation,
|
||||
std_mul,
|
||||
dropout,
|
||||
causal=True,
|
||||
residual=False)
|
||||
conv_layer = Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation,
|
||||
std_mul,
|
||||
dropout,
|
||||
causal=True,
|
||||
residual=False)
|
||||
attn_layer = Attention(
|
||||
out_channels,
|
||||
embed_dim,
|
||||
|
@ -211,10 +222,8 @@ class Decoder(dg.Layer):
|
|||
|
||||
# 1 * 1 conv to transform channels
|
||||
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
|
||||
self.last_conv = Conv1D(in_channels,
|
||||
mel_dim * r,
|
||||
1,
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.last_conv = Conv1D(
|
||||
in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))
|
||||
|
||||
# mel (before sigmoid) to done hat
|
||||
std = np.sqrt(1 / in_channels)
|
||||
|
@ -308,9 +317,8 @@ class Decoder(dg.Layer):
|
|||
# (B, C, T)
|
||||
frames = F.transpose(frames, [0, 2, 1])
|
||||
x = frames
|
||||
x = F.dropout(x,
|
||||
self.dropout,
|
||||
dropout_implementation="upscale_in_train")
|
||||
x = F.dropout(
|
||||
x, self.dropout, dropout_implementation="upscale_in_train")
|
||||
# Prenet
|
||||
for layer in self.prenet:
|
||||
if isinstance(layer, Conv1DGLU):
|
||||
|
@ -408,14 +416,13 @@ class Decoder(dg.Layer):
|
|||
test_inputs = fold_adjacent_frames(test_inputs, self.r)
|
||||
test_inputs = F.transpose(test_inputs, [0, 2, 1])
|
||||
|
||||
initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1),
|
||||
dtype=keys.dtype)
|
||||
initial_input = F.zeros(
|
||||
(batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)
|
||||
|
||||
t = 0 # decoder time step
|
||||
while True:
|
||||
frame_pos = F.fill_constant((batch_size, 1),
|
||||
value=t + 1,
|
||||
dtype="int64")
|
||||
frame_pos = F.fill_constant(
|
||||
(batch_size, 1), value=t + 1, dtype="int64")
|
||||
w = self.query_position_rate
|
||||
if self.n_speakers > 1:
|
||||
w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
|
||||
|
@ -433,9 +440,8 @@ class Decoder(dg.Layer):
|
|||
current_input = initial_input
|
||||
|
||||
x_t = current_input
|
||||
x_t = F.dropout(x_t,
|
||||
self.dropout,
|
||||
dropout_implementation="upscale_in_train")
|
||||
x_t = F.dropout(
|
||||
x_t, self.dropout, dropout_implementation="upscale_in_train")
|
||||
|
||||
# Prenet
|
||||
for layer in self.prenet:
|
||||
|
@ -453,15 +459,15 @@ class Decoder(dg.Layer):
|
|||
x_t = F.transpose(x_t, [0, 2, 1])
|
||||
if frame_pos_embed is not None:
|
||||
x_t += frame_pos_embed
|
||||
x_t, attn_scores = attn(
|
||||
x_t, (keys, values), mask,
|
||||
last_attended[i] if test_inputs is None else None)
|
||||
x_t, attn_scores = attn(x_t, (keys, values), mask,
|
||||
last_attended[i]
|
||||
if test_inputs is None else None)
|
||||
x_t = F.transpose(x_t, [0, 2, 1])
|
||||
step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc)
|
||||
# update last attended when necessary
|
||||
if self.force_monotonic_attention[i]:
|
||||
last_attended[i] = np.argmax(attn_scores.numpy(),
|
||||
axis=-1)[0][0]
|
||||
last_attended[i] = np.argmax(
|
||||
attn_scores.numpy(), axis=-1)[0][0]
|
||||
x_t = F.scale(residual + x_t, np.sqrt(0.5))
|
||||
if len(step_attn_scores):
|
||||
# (B, 1, T_enc) again
|
||||
|
@ -485,8 +491,8 @@ class Decoder(dg.Layer):
|
|||
t += 1
|
||||
|
||||
if test_inputs is None:
|
||||
if F.reduce_min(done_t).numpy(
|
||||
)[0] > 0.5 and t > self.min_decoder_steps:
|
||||
if F.reduce_min(done_t).numpy()[
|
||||
0] > 0.5 and t > self.min_decoder_steps:
|
||||
break
|
||||
elif t > self.max_decoder_steps:
|
||||
break
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
|
||||
|
@ -33,14 +47,16 @@ class Encoder(dg.Layer):
|
|||
self.dropout = dropout
|
||||
if n_speakers > 1:
|
||||
std = np.sqrt((1 - dropout) / speaker_dim)
|
||||
self.sp_proj1 = Linear(speaker_dim,
|
||||
embed_dim,
|
||||
act="softsign",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.sp_proj2 = Linear(speaker_dim,
|
||||
embed_dim,
|
||||
act="softsign",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.sp_proj1 = Linear(
|
||||
speaker_dim,
|
||||
embed_dim,
|
||||
act="softsign",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.sp_proj2 = Linear(
|
||||
speaker_dim,
|
||||
embed_dim,
|
||||
act="softsign",
|
||||
param_attr=I.Normal(scale=std))
|
||||
self.n_speakers = n_speakers
|
||||
|
||||
self.convolutions = dg.LayerList()
|
||||
|
@ -51,31 +67,34 @@ class Encoder(dg.Layer):
|
|||
if in_channels != out_channels:
|
||||
std = np.sqrt(std_mul / in_channels)
|
||||
self.convolutions.append(
|
||||
Conv1D(in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
act="relu",
|
||||
param_attr=I.Normal(scale=std)))
|
||||
Conv1D(
|
||||
in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
act="relu",
|
||||
param_attr=I.Normal(scale=std)))
|
||||
in_channels = out_channels
|
||||
std_mul = 2.0
|
||||
|
||||
self.convolutions.append(
|
||||
Conv1DGLU(n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation,
|
||||
std_mul,
|
||||
dropout,
|
||||
causal=False,
|
||||
residual=True))
|
||||
Conv1DGLU(
|
||||
n_speakers,
|
||||
speaker_dim,
|
||||
in_channels,
|
||||
out_channels,
|
||||
filter_size,
|
||||
dilation,
|
||||
std_mul,
|
||||
dropout,
|
||||
causal=False,
|
||||
residual=True))
|
||||
in_channels = out_channels
|
||||
std_mul = 4.0
|
||||
|
||||
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
|
||||
self.convolutions.append(
|
||||
Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
|
||||
Conv1D(
|
||||
in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
|
||||
|
||||
def forward(self, x, speaker_embed=None):
|
||||
"""
|
||||
|
@ -96,9 +115,8 @@ class Encoder(dg.Layer):
|
|||
representation for values.
|
||||
"""
|
||||
x = self.embed(x)
|
||||
x = F.dropout(x,
|
||||
self.dropout,
|
||||
dropout_implementation="upscale_in_train")
|
||||
x = F.dropout(
|
||||
x, self.dropout, dropout_implementation="upscale_in_train")
|
||||
x = F.transpose(x, [0, 2, 1])
|
||||
|
||||
if self.n_speakers > 1 and speaker_embed is not None:
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from numba import jit
|
||||
|
||||
|
@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
|
|||
return W
|
||||
|
||||
|
||||
def guided_attentions(encoder_lengths,
|
||||
decoder_lengths,
|
||||
max_decoder_len,
|
||||
def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
|
||||
g=0.2):
|
||||
B = len(encoder_lengths)
|
||||
max_input_len = encoder_lengths.max()
|
||||
|
@ -93,9 +105,8 @@ class TTSLoss(object):
|
|||
def binary_divergence(self, prediction, target, mask):
|
||||
flattened_prediction = F.reshape(prediction, [-1, 1])
|
||||
flattened_target = F.reshape(target, [-1, 1])
|
||||
flattened_loss = F.log_loss(flattened_prediction,
|
||||
flattened_target,
|
||||
epsilon=1e-8)
|
||||
flattened_loss = F.log_loss(
|
||||
flattened_prediction, flattened_target, epsilon=1e-8)
|
||||
bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
|
||||
|
||||
w = self.masked_weight
|
||||
|
@ -163,23 +174,20 @@ class TTSLoss(object):
|
|||
max_mel_steps = max_frames // self.downsample_factor
|
||||
max_decoder_steps = max_mel_steps // self.r
|
||||
|
||||
decoder_mask = F.sequence_mask(n_frames // self.downsample_factor //
|
||||
self.r,
|
||||
max_decoder_steps,
|
||||
dtype="float32")
|
||||
mel_mask = F.sequence_mask(n_frames // self.downsample_factor,
|
||||
max_mel_steps,
|
||||
dtype="float32")
|
||||
decoder_mask = F.sequence_mask(
|
||||
n_frames // self.downsample_factor // self.r,
|
||||
max_decoder_steps,
|
||||
dtype="float32")
|
||||
mel_mask = F.sequence_mask(
|
||||
n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
|
||||
lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
|
||||
|
||||
if compute_lin_loss:
|
||||
lin_hyp = lin_hyp[:, :-self.time_shift, :]
|
||||
lin_ref = lin_ref[:, self.time_shift:, :]
|
||||
lin_mask = lin_mask[:, self.time_shift:, :]
|
||||
lin_l1_loss = self.l1_loss(lin_hyp,
|
||||
lin_ref,
|
||||
lin_mask,
|
||||
priority_bin=self.priority_bin)
|
||||
lin_l1_loss = self.l1_loss(
|
||||
lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
|
||||
lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
|
||||
lin_loss = self.binary_divergence_weight * lin_bce_loss \
|
||||
+ (1 - self.binary_divergence_weight) * lin_l1_loss
|
||||
|
@ -197,9 +205,10 @@ class TTSLoss(object):
|
|||
total_loss += mel_loss
|
||||
|
||||
if compute_attn_loss:
|
||||
attn_loss = self.attention_loss(
|
||||
attn_hyp, input_lengths.numpy(),
|
||||
n_frames.numpy() // (self.downsample_factor * self.r))
|
||||
attn_loss = self.attention_loss(attn_hyp,
|
||||
input_lengths.numpy(),
|
||||
n_frames.numpy() //
|
||||
(self.downsample_factor * self.r))
|
||||
total_loss += attn_loss
|
||||
|
||||
if compute_done_loss:
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
|
||||
import paddle.fluid.layers as F
|
||||
|
@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
|
|||
mel_outputs, alignments, done, decoder_states = self.decoder(
|
||||
(keys, values), valid_lengths, mel_inputs, text_positions,
|
||||
frame_positions, speaker_embed)
|
||||
linear_outputs = self.converter(
|
||||
decoder_states if self.use_decoder_states else mel_outputs,
|
||||
speaker_embed)
|
||||
linear_outputs = self.converter(decoder_states
|
||||
if self.use_decoder_states else
|
||||
mel_outputs, speaker_embed)
|
||||
return mel_outputs, linear_outputs, alignments, done
|
||||
|
||||
def transduce(self, text_sequences, text_positions, speaker_indices=None):
|
||||
|
@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
|
|||
keys, values = self.encoder(text_sequences, speaker_embed)
|
||||
mel_outputs, alignments, done, decoder_states = self.decoder.decode(
|
||||
(keys, values), text_positions, speaker_embed)
|
||||
linear_outputs = self.converter(
|
||||
decoder_states if self.use_decoder_states else mel_outputs,
|
||||
speaker_embed)
|
||||
linear_outputs = self.converter(decoder_states
|
||||
if self.use_decoder_states else
|
||||
mel_outputs, speaker_embed)
|
||||
return mel_outputs, linear_outputs, alignments, done
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from paddle import fluid
|
||||
import paddle.fluid.layers as F
|
||||
|
@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer):
|
|||
speaker_position_rate) # (B, V, C)
|
||||
# make indices for gather_nd
|
||||
batch_id = F.expand(
|
||||
F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]),
|
||||
[1, time_steps])
|
||||
F.unsqueeze(
|
||||
F.range(
|
||||
0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
|
||||
# (B, T, 2)
|
||||
gather_nd_id = F.stack([batch_id, indices], -1)
|
||||
|
||||
out = F.gather_nd(weight, gather_nd_id)
|
||||
return out
|
||||
return out
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -1,8 +1,22 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.models.fastspeech.fft_block import FFTBlock
|
||||
|
||||
|
||||
class Decoder(dg.Layer):
|
||||
def __init__(self,
|
||||
len_max_seq,
|
||||
|
@ -18,16 +32,29 @@ class Decoder(dg.Layer):
|
|||
super(Decoder, self).__init__()
|
||||
|
||||
n_position = len_max_seq + 1
|
||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||
self.pos_inp = get_sinusoid_encoding_table(
|
||||
n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(
|
||||
size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||
self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [
|
||||
FFTBlock(
|
||||
d_model,
|
||||
d_inner,
|
||||
n_head,
|
||||
d_k,
|
||||
d_v,
|
||||
fft_conv1d_kernel,
|
||||
fft_conv1d_padding,
|
||||
dropout=dropout) for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.layer_stack):
|
||||
self.add_sublayer('fft_{}'.format(i), layer)
|
||||
|
||||
|
||||
def forward(self, enc_seq, enc_pos):
|
||||
"""
|
||||
Decoder layer of FastSpeech.
|
||||
|
@ -57,4 +84,4 @@ class Decoder(dg.Layer):
|
|||
slf_attn_mask=slf_attn_mask)
|
||||
dec_slf_attn_list += [dec_slf_attn]
|
||||
|
||||
return dec_output, dec_slf_attn_list
|
||||
return dec_output, dec_slf_attn_list
|
||||
|
|
|
@ -1,8 +1,22 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.models.fastspeech.fft_block import FFTBlock
|
||||
|
||||
|
||||
class Encoder(dg.Layer):
|
||||
def __init__(self,
|
||||
n_src_vocab,
|
||||
|
@ -19,14 +33,28 @@ class Encoder(dg.Layer):
|
|||
super(Encoder, self).__init__()
|
||||
n_position = len_max_seq + 1
|
||||
|
||||
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
|
||||
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
|
||||
self.src_word_emb = dg.Embedding(
|
||||
size=[n_src_vocab, d_model], padding_idx=0)
|
||||
self.pos_inp = get_sinusoid_encoding_table(
|
||||
n_position, d_model, padding_idx=0)
|
||||
self.position_enc = dg.Embedding(
|
||||
size=[n_position, d_model],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||
self.pos_inp),
|
||||
trainable=False))
|
||||
self.layer_stack = [
|
||||
FFTBlock(
|
||||
d_model,
|
||||
d_inner,
|
||||
n_head,
|
||||
d_k,
|
||||
d_v,
|
||||
fft_conv1d_kernel,
|
||||
fft_conv1d_padding,
|
||||
dropout=dropout) for _ in range(n_layers)
|
||||
]
|
||||
for i, layer in enumerate(self.layer_stack):
|
||||
self.add_sublayer('fft_{}'.format(i), layer)
|
||||
|
||||
|
@ -52,7 +80,8 @@ class Encoder(dg.Layer):
|
|||
non_pad_mask = get_non_pad_mask(character)
|
||||
|
||||
# -- Forward
|
||||
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
|
||||
enc_output = self.src_word_emb(character) + self.position_enc(
|
||||
text_pos) #(N, T, C)
|
||||
|
||||
for enc_layer in self.layer_stack:
|
||||
enc_output, enc_slf_attn = enc_layer(
|
||||
|
@ -60,5 +89,5 @@ class Encoder(dg.Layer):
|
|||
non_pad_mask=non_pad_mask,
|
||||
slf_attn_mask=slf_attn_mask)
|
||||
enc_slf_attn_list += [enc_slf_attn]
|
||||
|
||||
return enc_output, non_pad_mask, enc_slf_attn_list
|
||||
|
||||
return enc_output, non_pad_mask, enc_slf_attn_list
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
|
@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
|
|||
from parakeet.models.fastspeech.encoder import Encoder
|
||||
from parakeet.models.fastspeech.decoder import Decoder
|
||||
|
||||
|
||||
class FastSpeech(dg.Layer):
|
||||
def __init__(self, cfg):
|
||||
" FastSpeech"
|
||||
super(FastSpeech, self).__init__()
|
||||
|
||||
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
|
||||
len_max_seq=cfg['max_seq_len'],
|
||||
n_layers=cfg['encoder_n_layer'],
|
||||
n_head=cfg['encoder_head'],
|
||||
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
|
||||
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
|
||||
d_model=cfg['fs_hidden_size'],
|
||||
d_inner=cfg['encoder_conv1d_filter_size'],
|
||||
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
|
||||
fft_conv1d_padding=cfg['fft_conv1d_padding'],
|
||||
dropout=0.1)
|
||||
self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
|
||||
out_channels=cfg['duration_predictor_output_size'],
|
||||
filter_size=cfg['duration_predictor_filter_size'],
|
||||
dropout=cfg['dropout'])
|
||||
self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
|
||||
n_layers=cfg['decoder_n_layer'],
|
||||
n_head=cfg['decoder_head'],
|
||||
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
|
||||
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
|
||||
d_model=cfg['fs_hidden_size'],
|
||||
d_inner=cfg['decoder_conv1d_filter_size'],
|
||||
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
|
||||
fft_conv1d_padding=cfg['fft_conv1d_padding'],
|
||||
dropout=0.1)
|
||||
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
|
||||
self.encoder = Encoder(
|
||||
n_src_vocab=len(symbols) + 1,
|
||||
len_max_seq=cfg['max_seq_len'],
|
||||
n_layers=cfg['encoder_n_layer'],
|
||||
n_head=cfg['encoder_head'],
|
||||
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
|
||||
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
|
||||
d_model=cfg['fs_hidden_size'],
|
||||
d_inner=cfg['encoder_conv1d_filter_size'],
|
||||
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
|
||||
fft_conv1d_padding=cfg['fft_conv1d_padding'],
|
||||
dropout=0.1)
|
||||
self.length_regulator = LengthRegulator(
|
||||
input_size=cfg['fs_hidden_size'],
|
||||
out_channels=cfg['duration_predictor_output_size'],
|
||||
filter_size=cfg['duration_predictor_filter_size'],
|
||||
dropout=cfg['dropout'])
|
||||
self.decoder = Decoder(
|
||||
len_max_seq=cfg['max_seq_len'],
|
||||
n_layers=cfg['decoder_n_layer'],
|
||||
n_head=cfg['decoder_head'],
|
||||
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
|
||||
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
|
||||
d_model=cfg['fs_hidden_size'],
|
||||
d_inner=cfg['decoder_conv1d_filter_size'],
|
||||
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
|
||||
fft_conv1d_padding=cfg['fft_conv1d_padding'],
|
||||
dropout=0.1)
|
||||
self.weight = fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer())
|
||||
k = math.sqrt(1 / cfg['fs_hidden_size'])
|
||||
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
|
||||
self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
|
||||
cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'],
|
||||
param_attr = self.weight,
|
||||
bias_attr = self.bias,)
|
||||
self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
|
||||
num_hidden=512,
|
||||
filter_size=5,
|
||||
padding=int(5 / 2),
|
||||
num_conv=5,
|
||||
outputs_per_step=cfg['audio']['outputs_per_step'],
|
||||
use_cudnn=True,
|
||||
dropout=0.1,
|
||||
batchnorm_last=True)
|
||||
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))
|
||||
self.mel_linear = dg.Linear(
|
||||
cfg['fs_hidden_size'],
|
||||
cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
|
||||
param_attr=self.weight,
|
||||
bias_attr=self.bias, )
|
||||
self.postnet = PostConvNet(
|
||||
n_mels=cfg['audio']['num_mels'],
|
||||
num_hidden=512,
|
||||
filter_size=5,
|
||||
padding=int(5 / 2),
|
||||
num_conv=5,
|
||||
outputs_per_step=cfg['audio']['outputs_per_step'],
|
||||
use_cudnn=True,
|
||||
dropout=0.1,
|
||||
batchnorm_last=True)
|
||||
|
||||
def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
|
||||
def forward(self,
|
||||
character,
|
||||
text_pos,
|
||||
mel_pos=None,
|
||||
length_target=None,
|
||||
alpha=1.0):
|
||||
"""
|
||||
FastSpeech model.
|
||||
|
||||
|
@ -80,22 +106,25 @@ class FastSpeech(dg.Layer):
|
|||
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
|
||||
"""
|
||||
|
||||
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
|
||||
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
|
||||
character, text_pos)
|
||||
if fluid.framework._dygraph_tracer()._train_mode:
|
||||
|
||||
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
|
||||
target=length_target,
|
||||
alpha=alpha)
|
||||
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)
|
||||
|
||||
length_regulator_output, duration_predictor_output = self.length_regulator(
|
||||
encoder_output, target=length_target, alpha=alpha)
|
||||
decoder_output, dec_slf_attn_list = self.decoder(
|
||||
length_regulator_output, mel_pos)
|
||||
|
||||
mel_output = self.mel_linear(decoder_output)
|
||||
mel_output_postnet = self.postnet(mel_output) + mel_output
|
||||
|
||||
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
|
||||
else:
|
||||
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
|
||||
decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)
|
||||
length_regulator_output, decoder_pos = self.length_regulator(
|
||||
encoder_output, alpha=alpha)
|
||||
decoder_output, _ = self.decoder(length_regulator_output,
|
||||
decoder_pos)
|
||||
mel_output = self.mel_linear(decoder_output)
|
||||
mel_output_postnet = self.postnet(mel_output) + mel_output
|
||||
|
||||
return mel_output, mel_output_postnet
|
||||
return mel_output, mel_output_postnet
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
@ -6,11 +19,32 @@ import paddle.fluid as fluid
|
|||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.ffn import PositionwiseFeedForward
|
||||
|
||||
|
||||
class FFTBlock(dg.Layer):
|
||||
def __init__(self,
             d_model,
             d_inner,
             n_head,
             d_k,
             d_v,
             filter_size,
             padding,
             dropout=0.2):
    """Create the self-attention and position-wise feed-forward sublayers.

    Args:
        d_model: first positional argument of both sublayers.
        d_inner: second positional argument of the feed-forward sublayer.
        n_head: forwarded to the attention sublayer as ``num_head``.
        d_k: second positional argument of the attention sublayer.
        d_v: third positional argument of the attention sublayer.
        filter_size: forwarded to the feed-forward sublayer.
        padding: forwarded to the feed-forward sublayer.
        dropout: dropout value shared by both sublayers (default 0.2).
    """
    super(FFTBlock, self).__init__()

    # The attention sublayer is built first, then the feed-forward one.
    attention = MultiheadAttention(
        d_model,
        d_k,
        d_v,
        num_head=n_head,
        is_bias=True,
        dropout=dropout,
        is_concat=False)
    self.slf_attn = attention

    feed_forward = PositionwiseFeedForward(
        d_model,
        d_inner,
        filter_size=filter_size,
        padding=padding,
        dropout=dropout)
    self.pos_ffn = feed_forward
|
||||
|
||||
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
|
||||
"""
|
||||
|
@ -27,10 +61,11 @@ class FFTBlock(dg.Layer):
|
|||
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
|
||||
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
|
||||
"""
|
||||
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
||||
output, slf_attn = self.slf_attn(
|
||||
enc_input, enc_input, enc_input, mask=slf_attn_mask)
|
||||
output *= non_pad_mask
|
||||
|
||||
output = self.pos_ffn(output)
|
||||
output *= non_pad_mask
|
||||
|
||||
return output, slf_attn
|
||||
return output, slf_attn
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import math
|
||||
import parakeet.models.fastspeech.utils
|
||||
|
@ -6,47 +19,50 @@ import paddle.fluid.layers as layers
|
|||
import paddle.fluid as fluid
|
||||
from parakeet.modules.customized import Conv1D
|
||||
|
||||
|
||||
class LengthRegulator(dg.Layer):
|
||||
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
    """Create the length regulator and its duration predictor sublayer.

    All four arguments are forwarded unchanged, by keyword, to
    ``DurationPredictor``.
    """
    super(LengthRegulator, self).__init__()
    predictor_options = dict(
        input_size=input_size,
        out_channels=out_channels,
        filter_size=filter_size,
        dropout=dropout)
    self.duration_predictor = DurationPredictor(**predictor_options)
|
||||
|
||||
def LR(self, x, duration_predictor_output, alpha=1.0):
    """Expand every item of the batch and pad the results together.

    Args:
        x: batched input; its first axis is the batch axis.
        duration_predictor_output: durations, sliced along the first axis
            in step with ``x``.
        alpha: value handed on to ``self.expand`` (default 1.0).

    Returns:
        The result of ``self.pad`` applied to the per-item expansions.
    """
    # Slices of length 1 keep the leading batch axis for ``expand``.
    expanded = [
        self.expand(x[idx:idx + 1], duration_predictor_output[idx:idx + 1],
                    alpha) for idx in range(x.shape[0])
    ]
    return self.pad(expanded)
|
||||
|
||||
|
||||
def pad(self, input_ele):
    """Pad every element to the longest first-axis length and stack them.

    Args:
        input_ele: sequence of tensors whose first axis may differ in size.

    Returns:
        The stacked result of ``layers.stack`` over the padded elements.
    """
    longest = max([element.shape[0] for element in input_ele])
    padded_elements = []
    for element in input_ele:
        missing = longest - element.shape[0]
        # Paddings [0, missing, 0, 0] with value 0.0; presumably this
        # appends ``missing`` zero rows to a 2-D element -- confirm
        # against the paddle ``layers.pad`` documentation.
        padded_elements.append(
            layers.pad(element, [0, missing, 0, 0], pad_value=0.0))
    return layers.stack(padded_elements)
|
||||
|
||||
|
||||
def expand(self, batch, predicted, alpha):
    """Repeat each time step of one sequence according to its duration.

    Args:
        batch: a single sequence with a leading batch axis of size 1; the
            axis is squeezed away before the steps are repeated.
        predicted: durations; element ``[0, i]`` of its numpy value is the
            duration of time step ``i``.
        alpha: length scaling factor. Every duration is multiplied by
            ``alpha`` and rounded to the nearest integer, so values above
            1.0 lengthen the output and values below 1.0 shorten it.

    Returns:
        The repeated time steps concatenated along axis 0.
    """
    time_steps = batch.shape[1]
    fertilities = predicted.numpy()
    batch = layers.squeeze(batch, [0])

    out = []
    for i in range(time_steps):
        # ``alpha`` used to be accepted but ignored here. With integer
        # durations and alpha == 1.0 this yields exactly the previous
        # repeat count.
        repeats = int(round(float(fertilities[0, i]) * alpha))
        if repeats <= 0:
            # Steps with no (or a vanishing) duration contribute nothing.
            continue
        out.append(layers.expand(batch[i:i + 1, :], [repeats, 1]))
    return layers.concat(out, axis=0)
|
||||
|
||||
|
||||
def forward(self, x, alpha=1.0, target=None):
|
||||
"""
|
||||
|
@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
|
|||
else:
|
||||
duration_predictor_output = layers.round(duration_predictor_output)
|
||||
output = self.LR(x, duration_predictor_output, alpha)
|
||||
mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1))
|
||||
mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
|
||||
mel_pos = layers.unsqueeze(mel_pos, [0])
|
||||
return output, mel_pos
|
||||
|
||||
|
||||
class DurationPredictor(dg.Layer):
|
||||
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
|
||||
super(DurationPredictor, self).__init__()
|
||||
|
@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
|
|||
self.dropout = dropout
|
||||
|
||||
k = math.sqrt(1 / self.input_size)
|
||||
self.conv1 = Conv1D(num_channels = self.input_size,
|
||||
num_filters = self.out_channels,
|
||||
filter_size = self.filter_size,
|
||||
padding=1,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||
#data_format='NTC')
|
||||
self.conv1 = Conv1D(
|
||||
num_channels=self.input_size,
|
||||
num_filters=self.out_channels,
|
||||
filter_size=self.filter_size,
|
||||
padding=1,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
#data_format='NTC')
|
||||
k = math.sqrt(1 / self.out_channels)
|
||||
self.conv2 = Conv1D(num_channels = self.out_channels,
|
||||
num_filters = self.out_channels,
|
||||
filter_size = self.filter_size,
|
||||
padding=1,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||
#data_format='NTC')
|
||||
self.conv2 = Conv1D(
|
||||
num_channels=self.out_channels,
|
||||
num_filters=self.out_channels,
|
||||
filter_size=self.filter_size,
|
||||
padding=1,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
#data_format='NTC')
|
||||
self.layer_norm1 = dg.LayerNorm(self.out_channels)
|
||||
self.layer_norm2 = dg.LayerNorm(self.out_channels)
|
||||
|
||||
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
|
||||
self.weight = fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer())
|
||||
k = math.sqrt(1 / self.out_channels)
|
||||
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
|
||||
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))
|
||||
|
||||
self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight,
|
||||
bias_attr = self.bias)
|
||||
self.linear = dg.Linear(
|
||||
self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)
|
||||
|
||||
def forward(self, encoder_output):
|
||||
"""
|
||||
|
@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
|
|||
out (Variable), Shape(B, T, C), the output of duration predictor.
|
||||
"""
|
||||
# encoder_output.shape(N, T, C)
|
||||
out = layers.transpose(encoder_output, [0,2,1])
|
||||
out = layers.transpose(encoder_output, [0, 2, 1])
|
||||
out = self.conv1(out)
|
||||
out = layers.transpose(out, [0,2,1])
|
||||
out = layers.transpose(out, [0, 2, 1])
|
||||
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
|
||||
out = layers.transpose(out, [0,2,1])
|
||||
out = layers.transpose(out, [0, 2, 1])
|
||||
out = self.conv2(out)
|
||||
out = layers.transpose(out, [0,2,1])
|
||||
out = layers.transpose(out, [0, 2, 1])
|
||||
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
|
||||
out = layers.relu(self.linear(out))
|
||||
out = layers.squeeze(out, axes=[-1])
|
||||
|
||||
|
||||
return out
|
||||
|
||||
|
||||
return out
|
||||
|
|
|
@ -1,5 +1,19 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_alignment(attn_probs, mel_lens, n_head):
|
||||
max_F = 0
|
||||
assert attn_probs[0].shape[0] % n_head == 0
|
||||
|
@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head):
|
|||
for i in range(len(attn_probs)):
|
||||
multi_attn = attn_probs[i].numpy()
|
||||
for j in range(n_head):
|
||||
attn = multi_attn[j*batch_size:(j+1)*batch_size]
|
||||
attn = multi_attn[j * batch_size:(j + 1) * batch_size]
|
||||
F = score_F(attn)
|
||||
if max_F < F:
|
||||
max_F = F
|
||||
max_attn = attn
|
||||
alignment = compute_duration(max_attn, mel_lens)
|
||||
return alignment
|
||||
|
||||
|
||||
|
||||
def score_F(attn):
    """Return the mean of the maxima taken along the last axis of ``attn``.

    Args:
        attn: array of attention weights.

    Returns:
        The mean over all positions of the largest value along the last
        axis.
    """
    # Named ``peak_values`` so the builtin ``max`` is not shadowed.
    peak_values = np.max(attn, axis=-1)
    return np.mean(peak_values)
|
||||
|
||||
|
||||
def compute_duration(attn, mel_lens):
    """Count, per batch item, how often each last-axis index is the argmax.

    Args:
        attn: array indexed as ``attn[item, frame]``; the size of its third
            axis sets the width of the result.
        mel_lens: object whose ``.numpy()`` gives, per batch item, the
            number of leading frames to count.

    Returns:
        Array of shape ``(attn.shape[0], attn.shape[2])`` in which entry
        ``[i, k]`` is the number of counted frames of item ``i`` whose
        largest weight sits at index ``k``.
    """
    durations = np.zeros([attn.shape[0], attn.shape[2]])
    frame_counts = mel_lens.numpy()
    for item, weights in enumerate(attn):
        # Only the first ``frame_counts[item]`` frames are counted.
        for frame in range(frame_counts[item]):
            durations[item, np.argmax(weights[frame])] += 1

    return durations
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D
|
|||
from parakeet.modules.dynamic_gru import DynamicGRU
|
||||
import numpy as np
|
||||
|
||||
|
||||
class CBHG(dg.Layer):
|
||||
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
|
||||
max_pool_kernel_size=2, is_post=False):
|
||||
def __init__(self,
|
||||
hidden_size,
|
||||
batch_size,
|
||||
K=16,
|
||||
projection_size=256,
|
||||
num_gru_layers=2,
|
||||
max_pool_kernel_size=2,
|
||||
is_post=False):
|
||||
super(CBHG, self).__init__()
|
||||
"""
|
||||
:param hidden_size: dimension of hidden unit
|
||||
|
@ -24,28 +44,39 @@ class CBHG(dg.Layer):
|
|||
self.projection_size = projection_size
|
||||
self.conv_list = []
|
||||
k = math.sqrt(1 / projection_size)
|
||||
self.conv_list.append(Conv1D(num_channels = projection_size,
|
||||
num_filters = hidden_size,
|
||||
filter_size = 1,
|
||||
padding = int(np.floor(1/2)),
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=projection_size,
|
||||
num_filters=hidden_size,
|
||||
filter_size=1,
|
||||
padding=int(np.floor(1 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
k = math.sqrt(1 / hidden_size)
|
||||
for i in range(2,K+1):
|
||||
self.conv_list.append(Conv1D(num_channels = hidden_size,
|
||||
num_filters = hidden_size,
|
||||
filter_size = i,
|
||||
padding = int(np.floor(i/2)),
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
|
||||
for i in range(2, K + 1):
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=hidden_size,
|
||||
num_filters=hidden_size,
|
||||
filter_size=i,
|
||||
padding=int(np.floor(i / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
|
||||
for i, layer in enumerate(self.conv_list):
|
||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||
|
||||
self.batchnorm_list = []
|
||||
for i in range(K):
|
||||
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
|
||||
data_layout='NCHW'))
|
||||
self.batchnorm_list.append(
|
||||
dg.BatchNorm(
|
||||
hidden_size, data_layout='NCHW'))
|
||||
|
||||
for i, layer in enumerate(self.batchnorm_list):
|
||||
self.add_sublayer("batchnorm_list_{}".format(i), layer)
|
||||
|
@ -53,91 +84,120 @@ class CBHG(dg.Layer):
|
|||
conv_outdim = hidden_size * K
|
||||
|
||||
k = math.sqrt(1 / conv_outdim)
|
||||
self.conv_projection_1 = Conv1D(num_channels = conv_outdim,
|
||||
num_filters = hidden_size,
|
||||
filter_size = 3,
|
||||
padding = int(np.floor(3/2)),
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.conv_projection_1 = Conv1D(
|
||||
num_channels=conv_outdim,
|
||||
num_filters=hidden_size,
|
||||
filter_size=3,
|
||||
padding=int(np.floor(3 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
k = math.sqrt(1 / hidden_size)
|
||||
self.conv_projection_2 = Conv1D(num_channels = hidden_size,
|
||||
num_filters = projection_size,
|
||||
filter_size = 3,
|
||||
padding = int(np.floor(3/2)),
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.conv_projection_2 = Conv1D(
|
||||
num_channels=hidden_size,
|
||||
num_filters=projection_size,
|
||||
filter_size=3,
|
||||
padding=int(np.floor(3 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
|
||||
data_layout='NCHW')
|
||||
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
|
||||
data_layout='NCHW')
|
||||
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
|
||||
pool_type='max',
|
||||
pool_stride=1,
|
||||
pool_padding=1,
|
||||
data_format = "NCT")
|
||||
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
|
||||
self.batchnorm_proj_2 = dg.BatchNorm(
|
||||
projection_size, data_layout='NCHW')
|
||||
self.max_pool = Pool1D(
|
||||
pool_size=max_pool_kernel_size,
|
||||
pool_type='max',
|
||||
pool_stride=1,
|
||||
pool_padding=1,
|
||||
data_format="NCT")
|
||||
self.highway = Highwaynet(self.projection_size)
|
||||
|
||||
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
|
||||
h_0 = dg.to_variable(h_0)
|
||||
k = math.sqrt(1 / hidden_size)
|
||||
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
|
||||
is_reverse = False,
|
||||
origin_mode = True,
|
||||
h_0 = h_0)
|
||||
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
|
||||
is_reverse=True,
|
||||
origin_mode=True,
|
||||
h_0 = h_0)
|
||||
self.fc_forward1 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.fc_reverse1 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.gru_forward1 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=False,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
self.gru_reverse1 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=True,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
|
||||
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
|
||||
is_reverse = False,
|
||||
origin_mode = True,
|
||||
h_0 = h_0)
|
||||
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
|
||||
is_reverse=True,
|
||||
origin_mode=True,
|
||||
h_0 = h_0)
|
||||
self.fc_forward2 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.fc_reverse2 = dg.Linear(
|
||||
hidden_size,
|
||||
hidden_size // 2 * 3,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
self.gru_forward2 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=False,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
self.gru_reverse2 = DynamicGRU(
|
||||
size=self.hidden_size // 2,
|
||||
is_reverse=True,
|
||||
origin_mode=True,
|
||||
h_0=h_0)
|
||||
|
||||
def _conv_fit_dim(self, x, filter_size=3):
|
||||
if filter_size % 2 == 0:
|
||||
return x[:,:,:-1]
|
||||
return x[:, :, :-1]
|
||||
else:
|
||||
return x
|
||||
return x
|
||||
|
||||
def forward(self, input_):
|
||||
# input_.shape = [N, C, T]
|
||||
|
||||
conv_list = []
|
||||
conv_input = input_
|
||||
|
||||
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
|
||||
conv_input = self._conv_fit_dim(conv(conv_input), i+1)
|
||||
|
||||
for i, (conv, batchnorm
|
||||
) in enumerate(zip(self.conv_list, self.batchnorm_list)):
|
||||
conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
|
||||
conv_input = layers.relu(batchnorm(conv_input))
|
||||
conv_list.append(conv_input)
|
||||
|
||||
|
||||
conv_cat = layers.concat(conv_list, axis=1)
|
||||
conv_pool = self.max_pool(conv_cat)[:,:,:-1]
|
||||
|
||||
|
||||
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
|
||||
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
|
||||
|
||||
conv_pool = self.max_pool(conv_cat)[:, :, :-1]
|
||||
|
||||
conv_proj = layers.relu(
|
||||
self.batchnorm_proj_1(
|
||||
self._conv_fit_dim(self.conv_projection_1(conv_pool))))
|
||||
conv_proj = self.batchnorm_proj_2(
|
||||
self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
|
||||
|
||||
# conv_proj.shape = [N, C, T]
|
||||
highway = layers.transpose(conv_proj, [0,2,1])
|
||||
highway = layers.transpose(conv_proj, [0, 2, 1])
|
||||
highway = self.highway(highway)
|
||||
|
||||
# highway.shape = [N, T, C]
|
||||
|
@ -151,9 +211,10 @@ class CBHG(dg.Layer):
|
|||
out_forward = self.gru_forward2(fc_forward)
|
||||
out_reverse = self.gru_reverse2(fc_reverse)
|
||||
out = layers.concat([out_forward, out_reverse], axis=-1)
|
||||
out = layers.transpose(out, [0,2,1])
|
||||
out = layers.transpose(out, [0, 2, 1])
|
||||
return out
|
||||
|
||||
|
||||
class Highwaynet(dg.Layer):
|
||||
def __init__(self, num_units, num_layers=4):
|
||||
super(Highwaynet, self).__init__()
|
||||
|
@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
|
|||
self.linears = []
|
||||
k = math.sqrt(1 / num_units)
|
||||
for i in range(num_layers):
|
||||
self.linears.append(dg.Linear(num_units, num_units,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
|
||||
self.gates.append(dg.Linear(num_units, num_units,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
|
||||
|
||||
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
|
||||
self.linears.append(
|
||||
dg.Linear(
|
||||
num_units,
|
||||
num_units,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
self.gates.append(
|
||||
dg.Linear(
|
||||
num_units,
|
||||
num_units,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))))
|
||||
|
||||
for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
|
||||
self.add_sublayer("linears_{}".format(i), linear)
|
||||
self.add_sublayer("gates_{}".format(i), gate)
|
||||
|
||||
|
@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
|
|||
t_ = fluid.layers.sigmoid(gate(out))
|
||||
|
||||
c = 1 - t_
|
||||
out = h * t_ + out * c
|
||||
|
||||
out = h * t_ + out * c
|
||||
|
||||
return out
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
|
@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward
|
|||
from parakeet.models.transformer_tts.prenet import PreNet
|
||||
from parakeet.models.transformer_tts.post_convnet import PostConvNet
|
||||
|
||||
|
||||
class Decoder(dg.Layer):
|
||||
def __init__(self, num_hidden, config, num_head=4):
    """Create the decoder's parameters and sublayers.

    Args:
        num_hidden: hidden width shared by the sublayers.
        config: configuration mapping; ``config['audio']['num_mels']``,
            ``config['audio']['outputs_per_step']`` and
            ``config['hidden_size']`` are read.
        num_head: number of attention heads (default 4).
    """
    super(Decoder, self).__init__()
    self.num_hidden = num_hidden
    audio_config = config['audio']
    head_size = num_hidden // num_head
    # Bound of the uniform bias initializer used by every linear layer.
    k = math.sqrt(1 / num_hidden)

    def make_linear(output_size):
        # Xavier-initialized weight, uniform(-k, k)-initialized bias.
        return dg.Linear(
            num_hidden,
            output_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

    def make_attention():
        return MultiheadAttention(num_hidden, head_size, head_size)

    # Sublayers are created in the same order as before.
    param = fluid.ParamAttr()
    self.alpha = self.create_parameter(
        shape=(1, ),
        attr=param,
        dtype='float32',
        default_initializer=fluid.initializer.ConstantInitializer(
            value=1.0))
    self.pos_inp = get_sinusoid_encoding_table(
        1024, self.num_hidden, padding_idx=0)
    # The embedding is initialized from the table above and is not trainable.
    self.pos_emb = dg.Embedding(
        size=[1024, num_hidden],
        padding_idx=0,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(
                self.pos_inp),
            trainable=False))
    self.decoder_prenet = PreNet(
        input_size=audio_config['num_mels'],
        hidden_size=num_hidden * 2,
        output_size=num_hidden,
        dropout_rate=0.2)
    self.linear = make_linear(num_hidden)

    # Plain lists are kept as attributes, so each layer is also registered
    # explicitly through ``add_sublayer``.
    self.selfattn_layers = [make_attention() for _ in range(3)]
    for idx, sublayer in enumerate(self.selfattn_layers):
        self.add_sublayer("self_attn_{}".format(idx), sublayer)

    self.attn_layers = [make_attention() for _ in range(3)]
    for idx, sublayer in enumerate(self.attn_layers):
        self.add_sublayer("attn_{}".format(idx), sublayer)

    self.ffns = [
        PositionwiseFeedForward(
            num_hidden, num_hidden * num_head, filter_size=1)
        for _ in range(3)
    ]
    for idx, sublayer in enumerate(self.ffns):
        self.add_sublayer("ffns_{}".format(idx), sublayer)

    self.mel_linear = make_linear(
        audio_config['num_mels'] * audio_config['outputs_per_step'])
    self.stop_linear = make_linear(1)

    self.postconvnet = PostConvNet(
        audio_config['num_mels'],
        config['hidden_size'],
        filter_size=5,
        padding=4,
        num_conv=5,
        outputs_per_step=audio_config['outputs_per_step'],
        use_cudnn=True)
|
||||
|
||||
def forward(self, key, value, query, c_mask, positional):
|
||||
|
||||
# get decoder mask with triangular matrix
|
||||
|
||||
|
||||
if fluid.framework._dygraph_tracer()._train_mode:
|
||||
m_mask = get_non_pad_mask(positional)
|
||||
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
|
||||
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
|
||||
mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
|
||||
query)
|
||||
triu_tensor = dg.to_variable(
|
||||
get_triu_tensor(query.numpy(), query.numpy())).astype(
|
||||
np.float32)
|
||||
mask = mask + triu_tensor
|
||||
mask = fluid.layers.cast(mask == 0, np.float32)
|
||||
|
||||
|
||||
# (batch_size, decoder_len, encoder_len)
|
||||
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
|
||||
zero_mask = get_attn_key_pad_mask(
|
||||
layers.squeeze(c_mask, [-1]), query)
|
||||
else:
|
||||
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
|
||||
mask = get_triu_tensor(query.numpy(),
|
||||
query.numpy()).astype(np.float32)
|
||||
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
|
||||
m_mask, zero_mask = None, None
|
||||
|
||||
# Decoder pre-network
|
||||
query = self.decoder_prenet(query)
|
||||
|
||||
|
||||
# Centered position
|
||||
query = self.linear(query)
|
||||
|
||||
|
@ -84,10 +137,13 @@ class Decoder(dg.Layer):
|
|||
# Attention decoder-decoder, encoder-decoder
|
||||
selfattn_list = list()
|
||||
attn_list = list()
|
||||
|
||||
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
|
||||
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
|
||||
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
|
||||
|
||||
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
|
||||
self.ffns):
|
||||
query, attn_dec = selfattn(
|
||||
query, query, query, mask=mask, query_mask=m_mask)
|
||||
query, attn_dot = attn(
|
||||
key, value, query, mask=zero_mask, query_mask=m_mask)
|
||||
query = ffn(query)
|
||||
selfattn_list.append(attn_dec)
|
||||
attn_list.append(attn_dot)
|
||||
|
@ -96,7 +152,7 @@ class Decoder(dg.Layer):
|
|||
# Post Mel Network
|
||||
out = self.postconvnet(mel_out)
|
||||
out = mel_out + out
|
||||
|
||||
|
||||
# Stop tokens
|
||||
stop_tokens = self.stop_linear(query)
|
||||
stop_tokens = layers.squeeze(stop_tokens, [-1])
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
|
@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention
|
|||
from parakeet.modules.ffn import PositionwiseFeedForward
|
||||
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
|
||||
|
||||
|
||||
class Encoder(dg.Layer):
|
||||
def __init__(self, embedding_size, num_hidden, num_head=4):
|
||||
super(Encoder, self).__init__()
|
||||
self.num_hidden = num_hidden
|
||||
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
|
||||
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
|
||||
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
|
||||
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
|
||||
trainable=False))
|
||||
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
|
||||
num_hidden = num_hidden,
|
||||
use_cudnn=True)
|
||||
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
|
||||
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
|
||||
value=1.0))
|
||||
self.alpha = self.create_parameter(
|
||||
shape=(1, ), attr=param, dtype='float32')
|
||||
self.pos_inp = get_sinusoid_encoding_table(
|
||||
1024, self.num_hidden, padding_idx=0)
|
||||
self.pos_emb = dg.Embedding(
|
||||
size=[1024, num_hidden],
|
||||
padding_idx=0,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.NumpyArrayInitializer(
|
||||
self.pos_inp),
|
||||
trainable=False))
|
||||
self.encoder_prenet = EncoderPrenet(
|
||||
embedding_size=embedding_size,
|
||||
num_hidden=num_hidden,
|
||||
use_cudnn=True)
|
||||
self.layers = [
|
||||
MultiheadAttention(num_hidden, num_hidden // num_head,
|
||||
num_hidden // num_head) for _ in range(3)
|
||||
]
|
||||
for i, layer in enumerate(self.layers):
|
||||
self.add_sublayer("self_attn_{}".format(i), layer)
|
||||
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
|
||||
self.ffns = [
|
||||
PositionwiseFeedForward(
|
||||
num_hidden,
|
||||
num_hidden * num_head,
|
||||
filter_size=1,
|
||||
use_cudnn=True) for _ in range(3)
|
||||
]
|
||||
for i, layer in enumerate(self.ffns):
|
||||
self.add_sublayer("ffns_{}".format(i), layer)
|
||||
|
||||
|
@ -33,25 +62,23 @@ class Encoder(dg.Layer):
|
|||
mask = get_attn_key_pad_mask(positional, x)
|
||||
else:
|
||||
query_mask, mask = None, None
|
||||
|
||||
|
||||
# Encoder pre_network
|
||||
x = self.encoder_prenet(x) #(N,T,C)
|
||||
|
||||
|
||||
x = self.encoder_prenet(x) #(N,T,C)
|
||||
|
||||
# Get positional encoding
|
||||
positional = self.pos_emb(positional)
|
||||
|
||||
x = positional * self.alpha + x #(N, T, C)
|
||||
|
||||
positional = self.pos_emb(positional)
|
||||
|
||||
x = positional * self.alpha + x #(N, T, C)
|
||||
|
||||
# Positional dropout
|
||||
x = layers.dropout(x, 0.1)
|
||||
|
||||
|
||||
# Self attention encoder
|
||||
attentions = list()
|
||||
for layer, ffn in zip(self.layers, self.ffns):
|
||||
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
|
||||
x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
|
||||
x = ffn(x)
|
||||
attentions.append(attention)
|
||||
|
||||
return x, query_mask, attentions
|
||||
return x, query_mask, attentions
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
|
|||
self.embedding_size = embedding_size
|
||||
self.num_hidden = num_hidden
|
||||
self.use_cudnn = use_cudnn
|
||||
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
|
||||
padding_idx = None)
|
||||
self.embedding = dg.Embedding(
|
||||
size=[len(symbols), embedding_size], padding_idx=None)
|
||||
self.conv_list = []
|
||||
k = math.sqrt(1 / embedding_size)
|
||||
self.conv_list.append(Conv1D(num_channels = embedding_size,
|
||||
num_filters = num_hidden,
|
||||
filter_size = 5,
|
||||
padding = int(np.floor(5/2)),
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn))
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=embedding_size,
|
||||
num_filters=num_hidden,
|
||||
filter_size=5,
|
||||
padding=int(np.floor(5 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
k = math.sqrt(1 / num_hidden)
|
||||
for _ in range(2):
|
||||
self.conv_list.append(Conv1D(num_channels = num_hidden,
|
||||
num_filters = num_hidden,
|
||||
filter_size = 5,
|
||||
padding = int(np.floor(5/2)),
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn))
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=num_hidden,
|
||||
filter_size=5,
|
||||
padding=int(np.floor(5 / 2)),
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
for i, layer in enumerate(self.conv_list):
|
||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||
|
||||
self.batch_norm_list = [dg.BatchNorm(num_hidden,
|
||||
data_layout='NCHW') for _ in range(3)]
|
||||
self.batch_norm_list = [
|
||||
dg.BatchNorm(
|
||||
num_hidden, data_layout='NCHW') for _ in range(3)
|
||||
]
|
||||
|
||||
for i, layer in enumerate(self.batch_norm_list):
|
||||
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
||||
|
||||
k = math.sqrt(1 / num_hidden)
|
||||
self.projection = dg.Linear(num_hidden, num_hidden,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.projection = dg.Linear(
|
||||
num_hidden,
|
||||
num_hidden,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
def forward(self, x):
|
||||
x = self.embedding(x) #(batch_size, seq_len, embending_size)
|
||||
x = layers.transpose(x,[0,2,1])
|
||||
x = self.embedding(x) #(batch_size, seq_len, embending_size)
|
||||
x = layers.transpose(x, [0, 2, 1])
|
||||
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
|
||||
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
|
||||
x = layers.transpose(x,[0,2,1]) #(N,T,C)
|
||||
x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
|
||||
x = self.projection(x)
|
||||
|
||||
return x
|
||||
return x
|
||||
|
|
|
@ -1,11 +1,25 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.customized import Conv1D
|
||||
|
||||
|
||||
class PostConvNet(dg.Layer):
|
||||
def __init__(self,
|
||||
def __init__(self,
|
||||
n_mels=80,
|
||||
num_hidden=512,
|
||||
filter_size=5,
|
||||
|
@ -16,49 +30,66 @@ class PostConvNet(dg.Layer):
|
|||
dropout=0.1,
|
||||
batchnorm_last=False):
|
||||
super(PostConvNet, self).__init__()
|
||||
|
||||
|
||||
self.dropout = dropout
|
||||
self.num_conv = num_conv
|
||||
self.batchnorm_last = batchnorm_last
|
||||
self.conv_list = []
|
||||
k = math.sqrt(1 / (n_mels * outputs_per_step))
|
||||
self.conv_list.append(Conv1D(num_channels = n_mels * outputs_per_step,
|
||||
num_filters = num_hidden,
|
||||
filter_size = filter_size,
|
||||
padding = padding,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn))
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=n_mels * outputs_per_step,
|
||||
num_filters=num_hidden,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
k = math.sqrt(1 / num_hidden)
|
||||
for _ in range(1, num_conv-1):
|
||||
self.conv_list.append(Conv1D(num_channels = num_hidden,
|
||||
num_filters = num_hidden,
|
||||
filter_size = filter_size,
|
||||
padding = padding,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn))
|
||||
for _ in range(1, num_conv - 1):
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=num_hidden,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
self.conv_list.append(Conv1D(num_channels = num_hidden,
|
||||
num_filters = n_mels * outputs_per_step,
|
||||
filter_size = filter_size,
|
||||
padding = padding,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn))
|
||||
self.conv_list.append(
|
||||
Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=n_mels * outputs_per_step,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn))
|
||||
|
||||
for i, layer in enumerate(self.conv_list):
|
||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||
|
||||
self.batch_norm_list = [dg.BatchNorm(num_hidden,
|
||||
data_layout='NCHW') for _ in range(num_conv-1)]
|
||||
self.batch_norm_list = [
|
||||
dg.BatchNorm(
|
||||
num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
|
||||
]
|
||||
if self.batchnorm_last:
|
||||
self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
|
||||
data_layout='NCHW'))
|
||||
self.batch_norm_list.append(
|
||||
dg.BatchNorm(
|
||||
n_mels * outputs_per_step, data_layout='NCHW'))
|
||||
for i, layer in enumerate(self.batch_norm_list):
|
||||
self.add_sublayer("batch_norm_list_{}".format(i), layer)
|
||||
|
||||
|
||||
def forward(self, input):
|
||||
"""
|
||||
|
@ -69,18 +100,19 @@ class PostConvNet(dg.Layer):
|
|||
Returns:
|
||||
output (Variable), Shape(B, T, C), the result after postconvnet.
|
||||
"""
|
||||
|
||||
input = layers.transpose(input, [0,2,1])
|
||||
|
||||
input = layers.transpose(input, [0, 2, 1])
|
||||
len = input.shape[-1]
|
||||
for i in range(self.num_conv-1):
|
||||
for i in range(self.num_conv - 1):
|
||||
batch_norm = self.batch_norm_list[i]
|
||||
conv = self.conv_list[i]
|
||||
|
||||
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
|
||||
conv = self.conv_list[self.num_conv-1]
|
||||
input = conv(input)[:,:,:len]
|
||||
|
||||
input = layers.dropout(
|
||||
layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
|
||||
conv = self.conv_list[self.num_conv - 1]
|
||||
input = conv(input)[:, :, :len]
|
||||
if self.batchnorm_last:
|
||||
batch_norm = self.batch_norm_list[self.num_conv-1]
|
||||
batch_norm = self.batch_norm_list[self.num_conv - 1]
|
||||
input = layers.dropout(batch_norm(input), self.dropout)
|
||||
output = layers.transpose(input, [0,2,1])
|
||||
return output
|
||||
output = layers.transpose(input, [0, 2, 1])
|
||||
return output
|
||||
|
|
|
@ -1,8 +1,22 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
|
||||
class PreNet(dg.Layer):
|
||||
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
|
||||
"""
|
||||
|
@ -17,13 +31,21 @@ class PreNet(dg.Layer):
|
|||
self.dropout_rate = dropout_rate
|
||||
|
||||
k = math.sqrt(1 / input_size)
|
||||
self.linear1 = dg.Linear(input_size, hidden_size,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.linear1 = dg.Linear(
|
||||
input_size,
|
||||
hidden_size,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
k = math.sqrt(1 / hidden_size)
|
||||
self.linear2 = dg.Linear(hidden_size, output_size,
|
||||
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
|
||||
self.linear2 = dg.Linear(
|
||||
hidden_size,
|
||||
output_size,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)))
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
|
|
|
@ -1,8 +1,22 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.models.transformer_tts.encoder import Encoder
|
||||
from parakeet.models.transformer_tts.decoder import Decoder
|
||||
|
||||
|
||||
class TransformerTTS(dg.Layer):
|
||||
def __init__(self, config):
|
||||
super(TransformerTTS, self).__init__()
|
||||
|
@ -11,16 +25,10 @@ class TransformerTTS(dg.Layer):
|
|||
self.config = config
|
||||
|
||||
def forward(self, characters, mel_input, pos_text, pos_mel):
|
||||
|
||||
|
||||
key, c_mask, attns_enc = self.encoder(characters, pos_text)
|
||||
|
||||
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
|
||||
|
||||
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
|
||||
key, key, mel_input, c_mask, pos_mel)
|
||||
|
||||
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
import librosa
|
||||
import os, copy
|
||||
|
@ -6,14 +19,15 @@ import paddle.fluid.layers as layers
|
|||
|
||||
|
||||
def get_positional_table(d_pos_vec, n_position=1024):
|
||||
position_enc = np.array([
|
||||
[pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)]
|
||||
if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
|
||||
position_enc = np.array(
|
||||
[[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)]
|
||||
if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
|
||||
|
||||
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
|
||||
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
|
||||
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
|
||||
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
|
||||
return position_enc
|
||||
|
||||
|
||||
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
|
||||
''' Sinusoid position encoding table '''
|
||||
|
||||
|
@ -23,7 +37,8 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
|
|||
def get_posi_angle_vec(position):
|
||||
return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
|
||||
|
||||
sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
|
||||
sinusoid_table = np.array(
|
||||
[get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
|
||||
|
||||
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
|
||||
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
|
||||
|
@ -34,8 +49,10 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
|
|||
|
||||
return sinusoid_table
|
||||
|
||||
|
||||
def get_non_pad_mask(seq):
|
||||
return layers.unsqueeze((seq != 0).astype(np.float32),[-1])
|
||||
return layers.unsqueeze((seq != 0).astype(np.float32), [-1])
|
||||
|
||||
|
||||
def get_attn_key_pad_mask(seq_k, seq_q):
|
||||
''' For masking out the padding part of key sequence. '''
|
||||
|
@ -43,32 +60,37 @@ def get_attn_key_pad_mask(seq_k, seq_q):
|
|||
# Expand to fit the shape of key query attention matrix.
|
||||
len_q = seq_q.shape[1]
|
||||
padding_mask = (seq_k != 0).astype(np.float32)
|
||||
padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1])
|
||||
padding_mask = layers.expand(
|
||||
layers.unsqueeze(padding_mask, [1]), [1, len_q, 1])
|
||||
return padding_mask
|
||||
|
||||
|
||||
def get_triu_tensor(seq_k, seq_q):
|
||||
''' For make a triu tensor '''
|
||||
len_k = seq_k.shape[1]
|
||||
len_q = seq_q.shape[1]
|
||||
batch_size = seq_k.shape[0]
|
||||
triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
|
||||
triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0)
|
||||
|
||||
triu_tensor = np.repeat(
|
||||
np.expand_dims(
|
||||
triu_tensor, axis=0), batch_size, axis=0)
|
||||
|
||||
return triu_tensor
|
||||
|
||||
|
||||
def guided_attention(N, T, g=0.2):
|
||||
'''Guided attention. Refer to page 3 on the paper.'''
|
||||
W = np.zeros((N, T), dtype=np.float32)
|
||||
for n_pos in range(W.shape[0]):
|
||||
for t_pos in range(W.shape[1]):
|
||||
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
|
||||
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N))
|
||||
**2 / (2 * g * g))
|
||||
return W
|
||||
|
||||
|
||||
def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
|
||||
output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
|
||||
output = -1 * label * layers.log(input + epsilon) - (
|
||||
1 - label) * layers.log(1 - input + epsilon)
|
||||
output = output * (label * (position_weight - 1) + 1)
|
||||
|
||||
return layers.reduce_sum(output, dim=[0, 1])
|
||||
|
||||
|
||||
|
|
|
@ -1,27 +1,44 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.modules.customized import Conv1D
|
||||
from parakeet.models.transformer_tts.utils import *
|
||||
from parakeet.models.transformer_tts.cbhg import CBHG
|
||||
|
||||
|
||||
class Vocoder(dg.Layer):
|
||||
"""
|
||||
CBHG Network (mel -> linear)
|
||||
"""
|
||||
|
||||
def __init__(self, config, batch_size):
|
||||
super(Vocoder, self).__init__()
|
||||
self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'],
|
||||
num_filters = config['hidden_size'],
|
||||
filter_size=1)
|
||||
self.pre_proj = Conv1D(
|
||||
num_channels=config['audio']['num_mels'],
|
||||
num_filters=config['hidden_size'],
|
||||
filter_size=1)
|
||||
self.cbhg = CBHG(config['hidden_size'], batch_size)
|
||||
self.post_proj = Conv1D(num_channels = config['hidden_size'],
|
||||
num_filters = (config['audio']['n_fft'] // 2) + 1,
|
||||
filter_size=1)
|
||||
self.post_proj = Conv1D(
|
||||
num_channels=config['hidden_size'],
|
||||
num_filters=(config['audio']['n_fft'] // 2) + 1,
|
||||
filter_size=1)
|
||||
|
||||
def forward(self, mel):
|
||||
mel = layers.transpose(mel, [0,2,1])
|
||||
mel = layers.transpose(mel, [0, 2, 1])
|
||||
mel = self.pre_proj(mel)
|
||||
mel = self.cbhg(mel)
|
||||
mag_pred = self.post_proj(mel)
|
||||
mag_pred = layers.transpose(mag_pred, [0,2,1])
|
||||
mag_pred = layers.transpose(mag_pred, [0, 2, 1])
|
||||
return mag_pred
|
||||
|
|
|
@ -1 +1,15 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.models.waveflow.waveflow import WaveFlow
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
|
||||
import librosa
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import time
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms.
|
||||
The WaveNet model was originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499).
|
||||
Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.
|
||||
Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.
|
||||
|
||||
We implement the WaveNet model in Paddle Fluid with dynamic graph, which is convenient for building flexible network architectures.
|
||||
|
||||
|
@ -51,10 +51,10 @@ python -u train.py --config=${yaml} \
|
|||
#### Save and Load checkpoints
|
||||
|
||||
Our model will save model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default.
|
||||
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
|
||||
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
|
||||
|
||||
There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint):
|
||||
1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed.
|
||||
1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed.
|
||||
2. Use `--iteration=500000`.
|
||||
3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`.
|
||||
|
||||
|
@ -91,7 +91,7 @@ python -u synthesis.py --config=${yaml} \
|
|||
--root=./data/LJSpeech-1.1 \
|
||||
--name=${ModelName} --use_gpu=true \
|
||||
--output=./syn_audios \
|
||||
--sample=${SAMPLE}
|
||||
--sample=${SAMPLE}
|
||||
```
|
||||
|
||||
In this example, `--output` specifies where to save the synthesized audio files, and `--sample` specifies which sample in the validation dataset (a split of the whole LJSpeech dataset that by default contains the first 16 audio samples) to synthesize, based on the mel-spectrogram computed from the ground-truth audio. For example, `--sample=0` synthesizes the first audio sample in the validation dataset.
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
|
||||
import librosa
|
||||
|
@ -18,8 +32,8 @@ class Dataset(ljspeech.LJSpeech):
|
|||
self.fft_window_shift = config.fft_window_shift
|
||||
# Calculate context frames.
|
||||
frames_per_second = config.sample_rate // self.fft_window_shift
|
||||
train_clip_frames = int(np.ceil(
|
||||
config.train_clip_second * frames_per_second))
|
||||
train_clip_frames = int(
|
||||
np.ceil(config.train_clip_second * frames_per_second))
|
||||
context_frames = config.context_size // self.fft_window_shift
|
||||
self.num_frames = train_clip_frames + context_frames
|
||||
|
||||
|
@ -32,7 +46,7 @@ class Dataset(ljspeech.LJSpeech):
|
|||
fft_window_shift = config.fft_window_shift
|
||||
fft_window_size = config.fft_window_size
|
||||
fft_size = config.fft_size
|
||||
|
||||
|
||||
audio, loaded_sr = librosa.load(wav_path, sr=None)
|
||||
assert loaded_sr == sr
|
||||
|
||||
|
@ -41,42 +55,46 @@ class Dataset(ljspeech.LJSpeech):
|
|||
fft_padding = (fft_size - fft_window_shift) // 2
|
||||
desired_length = frames * fft_window_shift + fft_padding * 2
|
||||
pad_amount = (desired_length - audio.size) // 2
|
||||
|
||||
|
||||
if audio.size % 2 == 0:
|
||||
audio = np.pad(audio, (pad_amount, pad_amount), mode='reflect')
|
||||
else:
|
||||
audio = np.pad(audio, (pad_amount, pad_amount + 1), mode='reflect')
|
||||
|
||||
|
||||
# Normalize audio.
|
||||
audio = audio / np.abs(audio).max() * 0.999
|
||||
|
||||
|
||||
# Compute mel-spectrogram.
|
||||
# Turn center to False to prevent internal padding.
|
||||
spectrogram = librosa.core.stft(
|
||||
audio, hop_length=fft_window_shift,
|
||||
win_length=fft_window_size, n_fft=fft_size, center=False)
|
||||
audio,
|
||||
hop_length=fft_window_shift,
|
||||
win_length=fft_window_size,
|
||||
n_fft=fft_size,
|
||||
center=False)
|
||||
spectrogram_magnitude = np.abs(spectrogram)
|
||||
|
||||
|
||||
# Compute mel-spectrograms.
|
||||
mel_filter_bank = librosa.filters.mel(sr=sr, n_fft=fft_size,
|
||||
mel_filter_bank = librosa.filters.mel(sr=sr,
|
||||
n_fft=fft_size,
|
||||
n_mels=config.mel_bands)
|
||||
mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)
|
||||
mel_spectrogram = mel_spectrogram.T
|
||||
|
||||
|
||||
# Rescale mel_spectrogram.
|
||||
min_level, ref_level = 1e-5, 20
|
||||
mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
|
||||
mel_spectrogram = mel_spectrogram - ref_level
|
||||
mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)
|
||||
|
||||
|
||||
# Extract the center of audio that corresponds to mel spectrograms.
|
||||
audio = audio[fft_padding : -fft_padding]
|
||||
audio = audio[fft_padding:-fft_padding]
|
||||
assert mel_spectrogram.shape[0] * fft_window_shift == audio.size
|
||||
|
||||
return audio, mel_spectrogram
|
||||
|
||||
|
||||
class Subset(dataset.Dataset):
|
||||
class Subset(dataset.Dataset):
|
||||
def __init__(self, dataset, indices, valid):
|
||||
self.dataset = dataset
|
||||
self.indices = indices
|
||||
|
@ -100,23 +118,23 @@ class Subset(dataset.Dataset):
|
|||
|
||||
audio_start = frame_start * fft_window_shift
|
||||
audio_end = frame_end * fft_window_shift
|
||||
|
||||
audio = audio[audio_start : audio_end]
|
||||
|
||||
audio = audio[audio_start:audio_end]
|
||||
|
||||
return audio, mel, audio_start
|
||||
|
||||
def _batch_examples(self, batch):
|
||||
audios = [sample[0] for sample in batch]
|
||||
audio_starts = [sample[2] for sample in batch]
|
||||
|
||||
|
||||
# mels shape [num_frames, mel_bands]
|
||||
max_frames = max(sample[1].shape[0] for sample in batch)
|
||||
max_frames = max(sample[1].shape[0] for sample in batch)
|
||||
mels = [utils.pad_to_size(sample[1], max_frames) for sample in batch]
|
||||
|
||||
|
||||
audios = np.array(audios, dtype=np.float32)
|
||||
mels = np.array(mels, dtype=np.float32)
|
||||
audio_starts = np.array(audio_starts, dtype=np.int32)
|
||||
|
||||
|
||||
return audios, mels, audio_starts
|
||||
|
||||
def __len__(self):
|
||||
|
@ -138,17 +156,17 @@ class LJSpeech:
|
|||
|
||||
# Train dataset.
|
||||
trainset = Subset(ds, train_indices, valid=False)
|
||||
sampler = DistributedSampler(len(trainset), nranks, rank)
|
||||
sampler = DistributedSampler(len(trainset), nranks, rank)
|
||||
total_bs = config.batch_size
|
||||
assert total_bs % nranks == 0
|
||||
train_sampler = BatchSampler(sampler, total_bs // nranks,
|
||||
drop_last=True)
|
||||
train_sampler = BatchSampler(
|
||||
sampler, total_bs // nranks, drop_last=True)
|
||||
trainloader = DataCargo(trainset, batch_sampler=train_sampler)
|
||||
|
||||
trainreader = fluid.io.PyReader(capacity=50, return_list=True)
|
||||
trainreader.decorate_batch_generator(trainloader, place)
|
||||
self.trainloader = (data for _ in iter(int, 1)
|
||||
for data in trainreader())
|
||||
for data in trainreader())
|
||||
|
||||
# Valid dataset.
|
||||
validset = Subset(ds, valid_indices, valid=True)
|
||||
|
@ -156,5 +174,5 @@ class LJSpeech:
|
|||
validloader = DataCargo(validset, batch_size=1, shuffle=False)
|
||||
|
||||
validreader = fluid.io.PyReader(capacity=20, return_list=True)
|
||||
validreader.decorate_batch_generator(validloader, place)
|
||||
validreader.decorate_batch_generator(validloader, place)
|
||||
self.validloader = validreader
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Utility module for restarting training when using SLURM.
|
||||
"""
|
||||
|
@ -45,8 +58,8 @@ def parse_time(text):
|
|||
try:
|
||||
return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds)
|
||||
except ValueError as e:
|
||||
raise ValueError("Error parsing time {}. Got error {}.".format(
|
||||
text, str(e)))
|
||||
raise ValueError("Error parsing time {}. Got error {}.".format(text,
|
||||
str(e)))
|
||||
|
||||
|
||||
def restart_command():
|
||||
|
@ -76,8 +89,10 @@ def restart_command():
|
|||
gres, partition = info.get("Gres"), info.get("Partition")
|
||||
stderr, stdout = info.get("StdErr"), info.get("StdOut")
|
||||
job_name = info.get("JobName")
|
||||
command = ["sbatch", "--job-name={}".format(job_name),
|
||||
"--ntasks={}".format(num_tasks)]
|
||||
command = [
|
||||
"sbatch", "--job-name={}".format(job_name),
|
||||
"--ntasks={}".format(num_tasks)
|
||||
]
|
||||
|
||||
if partition:
|
||||
command.extend(["--partition", partition])
|
||||
|
@ -98,12 +113,13 @@ def restart_command():
|
|||
dist_setting = ['-m', 'paddle.distributed.launch']
|
||||
wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv
|
||||
|
||||
command.append(
|
||||
"--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd)))
|
||||
command.append("--wrap={}".format(" ".join(
|
||||
shlex.quote(arg) for arg in wrap_cmd)))
|
||||
time_limit_string = info["TimeLimit"]
|
||||
if time_limit_string.lower() == "unlimited":
|
||||
print("UNLIMITED detected: restart OFF, infinite learning ON.",
|
||||
flush=True)
|
||||
print(
|
||||
"UNLIMITED detected: restart OFF, infinite learning ON.",
|
||||
flush=True)
|
||||
return command, None
|
||||
time_limit = parse_time(time_limit_string)
|
||||
runtime = parse_time(info["RunTime"])
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
from pprint import pprint
|
||||
|
@ -12,25 +26,42 @@ from wavenet import WaveNet
|
|||
|
||||
|
||||
def add_options_to_parser(parser):
|
||||
parser.add_argument('--model', type=str, default='wavenet',
|
||||
parser.add_argument(
|
||||
'--model',
|
||||
type=str,
|
||||
default='wavenet',
|
||||
help="general name of the model")
|
||||
parser.add_argument('--name', type=str,
|
||||
help="specific name of the training model")
|
||||
parser.add_argument('--root', type=str,
|
||||
help="root path of the LJSpeech dataset")
|
||||
parser.add_argument(
|
||||
'--name', type=str, help="specific name of the training model")
|
||||
parser.add_argument(
|
||||
'--root', type=str, help="root path of the LJSpeech dataset")
|
||||
|
||||
parser.add_argument('--use_gpu', type=bool, default=True,
|
||||
parser.add_argument(
|
||||
'--use_gpu',
|
||||
type=bool,
|
||||
default=True,
|
||||
help="option to use gpu training")
|
||||
|
||||
parser.add_argument('--iteration', type=int, default=None,
|
||||
parser.add_argument(
|
||||
'--iteration',
|
||||
type=int,
|
||||
default=None,
|
||||
help=("which iteration of checkpoint to load, "
|
||||
"default to load the latest checkpoint"))
|
||||
parser.add_argument('--checkpoint', type=str, default=None,
|
||||
parser.add_argument(
|
||||
'--checkpoint',
|
||||
type=str,
|
||||
default=None,
|
||||
help="path of the checkpoint to load")
|
||||
|
||||
parser.add_argument('--output', type=str, default="./syn_audios",
|
||||
parser.add_argument(
|
||||
'--output',
|
||||
type=str,
|
||||
default="./syn_audios",
|
||||
help="path to write synthesized audio files")
|
||||
parser.add_argument('--sample', type=int,
|
||||
parser.add_argument(
|
||||
'--sample',
|
||||
type=int,
|
||||
help="which of the valid samples to synthesize audio")
|
||||
|
||||
|
||||
|
@ -52,7 +83,7 @@ def synthesize(config):
|
|||
fluid.default_startup_program().random_seed = seed
|
||||
fluid.default_main_program().random_seed = seed
|
||||
print("Random Seed: ", seed)
|
||||
|
||||
|
||||
# Build model.
|
||||
model = WaveNet(config, checkpoint_dir)
|
||||
model.build(training=False)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
|
@ -18,24 +32,42 @@ MAXIMUM_SAVE_TIME = 10 * 60
|
|||
|
||||
|
||||
def add_options_to_parser(parser):
|
||||
parser.add_argument('--model', type=str, default='wavenet',
|
||||
parser.add_argument(
|
||||
'--model',
|
||||
type=str,
|
||||
default='wavenet',
|
||||
help="general name of the model")
|
||||
parser.add_argument('--name', type=str,
|
||||
help="specific name of the training model")
|
||||
parser.add_argument('--root', type=str,
|
||||
help="root path of the LJSpeech dataset")
|
||||
parser.add_argument(
|
||||
'--name', type=str, help="specific name of the training model")
|
||||
parser.add_argument(
|
||||
'--root', type=str, help="root path of the LJSpeech dataset")
|
||||
|
||||
parser.add_argument('--parallel', type=bool, default=True,
|
||||
parser.add_argument(
|
||||
'--parallel',
|
||||
type=bool,
|
||||
default=True,
|
||||
help="option to use data parallel training")
|
||||
parser.add_argument('--use_gpu', type=bool, default=True,
|
||||
parser.add_argument(
|
||||
'--use_gpu',
|
||||
type=bool,
|
||||
default=True,
|
||||
help="option to use gpu training")
|
||||
|
||||
parser.add_argument('--iteration', type=int, default=None,
|
||||
parser.add_argument(
|
||||
'--iteration',
|
||||
type=int,
|
||||
default=None,
|
||||
help=("which iteration of checkpoint to load, "
|
||||
"default to load the latest checkpoint"))
|
||||
parser.add_argument('--checkpoint', type=str, default=None,
|
||||
parser.add_argument(
|
||||
'--checkpoint',
|
||||
type=str,
|
||||
default=None,
|
||||
help="path of the checkpoint to load")
|
||||
parser.add_argument('--slurm', type=bool, default=False,
|
||||
parser.add_argument(
|
||||
'--slurm',
|
||||
type=bool,
|
||||
default=False,
|
||||
help="whether you are using slurm to submit training jobs")
|
||||
|
||||
|
||||
|
@ -104,8 +136,8 @@ def train(config):
|
|||
|
||||
# Check whether reaching the time limit.
|
||||
if config.slurm:
|
||||
done = (death_time is not None and death_time - time.time() <
|
||||
MAXIMUM_SAVE_TIME)
|
||||
done = (death_time is not None and
|
||||
death_time - time.time() < MAXIMUM_SAVE_TIME)
|
||||
|
||||
if rank == 0 and done:
|
||||
print("Saving progress before exiting.")
|
||||
|
@ -127,8 +159,8 @@ def train(config):
|
|||
|
||||
if __name__ == "__main__":
|
||||
# Create parser.
|
||||
parser = jsonargparse.ArgumentParser(description="Train WaveNet model",
|
||||
formatter_class='default_argparse')
|
||||
parser = jsonargparse.ArgumentParser(
|
||||
description="Train WaveNet model", formatter_class='default_argparse')
|
||||
add_options_to_parser(parser)
|
||||
utils.add_config_options_to_parser(parser)
|
||||
|
||||
|
@ -136,4 +168,4 @@ if __name__ == "__main__":
|
|||
# For conflicting updates to the same field,
|
||||
# the preceding update will be overwritten by the following one.
|
||||
config = parser.parse_args()
|
||||
train(config)
|
||||
train(config)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import time
|
||||
|
@ -8,57 +22,82 @@ import paddle.fluid.dygraph as dg
|
|||
|
||||
|
||||
def add_config_options_to_parser(parser):
|
||||
parser.add_argument('--valid_size', type=int,
|
||||
help="size of the valid dataset")
|
||||
parser.add_argument('--train_clip_second', type=float,
|
||||
parser.add_argument(
|
||||
'--valid_size', type=int, help="size of the valid dataset")
|
||||
parser.add_argument(
|
||||
'--train_clip_second',
|
||||
type=float,
|
||||
help="the length of audio clip for training")
|
||||
parser.add_argument('--sample_rate', type=int,
|
||||
help="sampling rate of audio data file")
|
||||
parser.add_argument('--fft_window_shift', type=int,
|
||||
parser.add_argument(
|
||||
'--sample_rate', type=int, help="sampling rate of audio data file")
|
||||
parser.add_argument(
|
||||
'--fft_window_shift',
|
||||
type=int,
|
||||
help="the shift of fft window for each frame")
|
||||
parser.add_argument('--fft_window_size', type=int,
|
||||
parser.add_argument(
|
||||
'--fft_window_size',
|
||||
type=int,
|
||||
help="the size of fft window for each frame")
|
||||
parser.add_argument('--fft_size', type=int,
|
||||
help="the size of fft filter on each frame")
|
||||
parser.add_argument('--mel_bands', type=int,
|
||||
parser.add_argument(
|
||||
'--fft_size', type=int, help="the size of fft filter on each frame")
|
||||
parser.add_argument(
|
||||
'--mel_bands',
|
||||
type=int,
|
||||
help="the number of mel bands when calculating mel spectrograms")
|
||||
|
||||
parser.add_argument('--seed', type=int,
|
||||
help="seed of random initialization for the model")
|
||||
parser.add_argument('--batch_size', type=int,
|
||||
help="batch size for training")
|
||||
parser.add_argument('--test_every', type=int,
|
||||
help="test interval during training")
|
||||
parser.add_argument('--save_every', type=int,
|
||||
parser.add_argument(
|
||||
'--seed', type=int, help="seed of random initialization for the model")
|
||||
parser.add_argument(
|
||||
'--batch_size', type=int, help="batch size for training")
|
||||
parser.add_argument(
|
||||
'--test_every', type=int, help="test interval during training")
|
||||
parser.add_argument(
|
||||
'--save_every',
|
||||
type=int,
|
||||
help="checkpointing interval during training")
|
||||
parser.add_argument('--max_iterations', type=int,
|
||||
help="maximum training iterations")
|
||||
parser.add_argument(
|
||||
'--max_iterations', type=int, help="maximum training iterations")
|
||||
|
||||
parser.add_argument('--layers', type=int,
|
||||
help="number of dilated convolution layers")
|
||||
parser.add_argument('--kernel_width', type=int,
|
||||
help="dilated convolution kernel width")
|
||||
parser.add_argument('--dilation_block', type=list,
|
||||
help="dilated convolution kernel width")
|
||||
parser.add_argument(
|
||||
'--layers', type=int, help="number of dilated convolution layers")
|
||||
parser.add_argument(
|
||||
'--kernel_width', type=int, help="dilated convolution kernel width")
|
||||
parser.add_argument(
|
||||
'--dilation_block', type=list, help="dilated convolution kernel width")
|
||||
parser.add_argument('--residual_channels', type=int)
|
||||
parser.add_argument('--skip_channels', type=int)
|
||||
parser.add_argument('--loss_type', type=str,
|
||||
help="mix-gaussian-pdf or softmax")
|
||||
parser.add_argument('--num_channels', type=int, default=None,
|
||||
parser.add_argument(
|
||||
'--loss_type', type=str, help="mix-gaussian-pdf or softmax")
|
||||
parser.add_argument(
|
||||
'--num_channels',
|
||||
type=int,
|
||||
default=None,
|
||||
help="number of channels for softmax output")
|
||||
parser.add_argument('--num_mixtures', type=int, default=None,
|
||||
parser.add_argument(
|
||||
'--num_mixtures',
|
||||
type=int,
|
||||
default=None,
|
||||
help="number of gaussian mixtures for gaussian output")
|
||||
parser.add_argument('--log_scale_min', type=float, default=None,
|
||||
parser.add_argument(
|
||||
'--log_scale_min',
|
||||
type=float,
|
||||
default=None,
|
||||
help="minimum clip value of log variance of gaussian output")
|
||||
|
||||
parser.add_argument('--conditioner.filter_sizes', type=list,
|
||||
parser.add_argument(
|
||||
'--conditioner.filter_sizes',
|
||||
type=list,
|
||||
help="conv2d tranpose op filter sizes for building conditioner")
|
||||
parser.add_argument('--conditioner.upsample_factors', type=list,
|
||||
parser.add_argument(
|
||||
'--conditioner.upsample_factors',
|
||||
type=list,
|
||||
help="list of upsample factors for building conditioner")
|
||||
|
||||
parser.add_argument('--learning_rate', type=float)
|
||||
parser.add_argument('--gradient_max_norm', type=float)
|
||||
parser.add_argument('--anneal.every', type=int,
|
||||
parser.add_argument(
|
||||
'--anneal.every',
|
||||
type=int,
|
||||
help="step interval for annealing learning rate")
|
||||
parser.add_argument('--anneal.rate', type=float)
|
||||
|
||||
|
@ -113,8 +152,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
|
|||
handle.write("model_checkpoint_path: step-{}".format(iteration))
|
||||
|
||||
|
||||
def load_parameters(checkpoint_dir, rank, model, optimizer=None,
|
||||
iteration=None, file_path=None):
|
||||
def load_parameters(checkpoint_dir,
|
||||
rank,
|
||||
model,
|
||||
optimizer=None,
|
||||
iteration=None,
|
||||
file_path=None):
|
||||
if file_path is None:
|
||||
if iteration is None:
|
||||
iteration = load_latest_checkpoint(checkpoint_dir, rank)
|
||||
|
@ -128,7 +171,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None,
|
|||
if optimizer and optimizer_dict:
|
||||
optimizer.set_dict(optimizer_dict)
|
||||
print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
|
||||
rank, file_path))
|
||||
rank, file_path))
|
||||
|
||||
|
||||
def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import time
|
||||
|
@ -13,8 +27,13 @@ from wavenet_modules import WaveNetModule
|
|||
|
||||
|
||||
class WaveNet():
|
||||
def __init__(self, config, checkpoint_dir, parallel=False, rank=0,
|
||||
nranks=1, tb_logger=None):
|
||||
def __init__(self,
|
||||
config,
|
||||
checkpoint_dir,
|
||||
parallel=False,
|
||||
rank=0,
|
||||
nranks=1,
|
||||
tb_logger=None):
|
||||
# Process config to calculate the context size
|
||||
dilations = list(
|
||||
itertools.islice(
|
||||
|
@ -29,12 +48,12 @@ class WaveNet():
|
|||
|
||||
def build(self, training=True):
|
||||
config = self.config
|
||||
dataset = LJSpeech(config, self.nranks, self.rank)
|
||||
dataset = LJSpeech(config, self.nranks, self.rank)
|
||||
self.trainloader = dataset.trainloader
|
||||
self.validloader = dataset.validloader
|
||||
|
||||
wavenet = WaveNetModule("wavenet", config, self.rank)
|
||||
|
||||
|
||||
# Dry run once to create and initialize all necessary parameters.
|
||||
audio = dg.to_variable(np.random.randn(1, 20000).astype(np.float32))
|
||||
mel = dg.to_variable(
|
||||
|
@ -45,38 +64,44 @@ class WaveNet():
|
|||
if training:
|
||||
# Create Learning rate scheduler.
|
||||
lr_scheduler = dg.ExponentialDecay(
|
||||
learning_rate = config.learning_rate,
|
||||
decay_steps = config.anneal.every,
|
||||
decay_rate = config.anneal.rate,
|
||||
learning_rate=config.learning_rate,
|
||||
decay_steps=config.anneal.every,
|
||||
decay_rate=config.anneal.rate,
|
||||
staircase=True)
|
||||
|
||||
|
||||
optimizer = fluid.optimizer.AdamOptimizer(
|
||||
learning_rate=lr_scheduler)
|
||||
|
||||
|
||||
clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
|
||||
config.gradient_max_norm)
|
||||
|
||||
# Load parameters.
|
||||
utils.load_parameters(self.checkpoint_dir, self.rank,
|
||||
wavenet, optimizer,
|
||||
iteration=config.iteration,
|
||||
file_path=config.checkpoint)
|
||||
utils.load_parameters(
|
||||
self.checkpoint_dir,
|
||||
self.rank,
|
||||
wavenet,
|
||||
optimizer,
|
||||
iteration=config.iteration,
|
||||
file_path=config.checkpoint)
|
||||
print("Rank {}: checkpoint loaded.".format(self.rank))
|
||||
|
||||
|
||||
# Data parallelism.
|
||||
if self.parallel:
|
||||
strategy = dg.parallel.prepare_context()
|
||||
wavenet = dg.parallel.DataParallel(wavenet, strategy)
|
||||
|
||||
|
||||
self.wavenet = wavenet
|
||||
self.optimizer = optimizer
|
||||
self.clipper = clipper
|
||||
|
||||
else:
|
||||
# Load parameters.
|
||||
utils.load_parameters(self.checkpoint_dir, self.rank, wavenet,
|
||||
iteration=config.iteration,
|
||||
file_path=config.checkpoint)
|
||||
utils.load_parameters(
|
||||
self.checkpoint_dir,
|
||||
self.rank,
|
||||
wavenet,
|
||||
iteration=config.iteration,
|
||||
file_path=config.checkpoint)
|
||||
print("Rank {}: checkpoint loaded.".format(self.rank))
|
||||
|
||||
self.wavenet = wavenet
|
||||
|
@ -104,7 +129,9 @@ class WaveNet():
|
|||
else:
|
||||
current_lr = self.optimizer._learning_rate
|
||||
|
||||
self.optimizer.minimize(loss, grad_clip=self.clipper,
|
||||
self.optimizer.minimize(
|
||||
loss,
|
||||
grad_clip=self.clipper,
|
||||
parameter_list=self.wavenet.parameters())
|
||||
self.wavenet.clear_gradients()
|
||||
|
||||
|
@ -143,10 +170,16 @@ class WaveNet():
|
|||
|
||||
tb = self.tb_logger
|
||||
tb.add_scalar("Valid-Avg-Loss", loss_val, iteration)
|
||||
tb.add_audio("Teacher-Forced-Audio-0", sample_audios[0].numpy(),
|
||||
iteration, sample_rate=self.config.sample_rate)
|
||||
tb.add_audio("Teacher-Forced-Audio-1", sample_audios[1].numpy(),
|
||||
iteration, sample_rate=self.config.sample_rate)
|
||||
tb.add_audio(
|
||||
"Teacher-Forced-Audio-0",
|
||||
sample_audios[0].numpy(),
|
||||
iteration,
|
||||
sample_rate=self.config.sample_rate)
|
||||
tb.add_audio(
|
||||
"Teacher-Forced-Audio-1",
|
||||
sample_audios[1].numpy(),
|
||||
iteration,
|
||||
sample_rate=self.config.sample_rate)
|
||||
|
||||
@dg.no_grad
|
||||
def infer(self, iteration):
|
||||
|
@ -165,10 +198,9 @@ class WaveNet():
|
|||
start_time = time.time()
|
||||
syn_audio = self.wavenet.synthesize(mels_list[sample])
|
||||
syn_time = time.time() - start_time
|
||||
print("audio shape {}, synthesis time {}".format(
|
||||
syn_audio.shape, syn_time))
|
||||
librosa.output.write_wav(filename, syn_audio,
|
||||
sr=config.sample_rate)
|
||||
print("audio shape {}, synthesis time {}".format(syn_audio.shape,
|
||||
syn_time))
|
||||
librosa.output.write_wav(filename, syn_audio, sr=config.sample_rate)
|
||||
|
||||
def save(self, iteration):
|
||||
utils.save_latest_parameters(self.checkpoint_dir, iteration,
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
|
@ -16,11 +30,11 @@ def get_padding(filter_size, stride, padding_type='same'):
|
|||
|
||||
def extract_slices(x, audio_starts, audio_length, rank):
|
||||
slices = []
|
||||
for i in range(x.shape[0]):
|
||||
for i in range(x.shape[0]):
|
||||
start = audio_starts.numpy()[i]
|
||||
end = start + audio_length
|
||||
slice = fluid.layers.slice(
|
||||
x, axes=[0, 1], starts=[i, start], ends=[i+1, end])
|
||||
x, axes=[0, 1], starts=[i, start], ends=[i + 1, end])
|
||||
slices.append(fluid.layers.squeeze(slice, [0]))
|
||||
|
||||
x = fluid.layers.stack(slices, axis=0)
|
||||
|
@ -50,7 +64,7 @@ class Conditioner(dg.Layer):
|
|||
# Register python list as parameters.
|
||||
for i, layer in enumerate(self.deconvs):
|
||||
self.add_sublayer("conv_transpose_{}".format(i), layer)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
x = fluid.layers.unsqueeze(x, 1)
|
||||
for layer in self.deconvs:
|
||||
|
@ -62,7 +76,7 @@ class Conditioner(dg.Layer):
|
|||
class WaveNetModule(dg.Layer):
|
||||
def __init__(self, name_scope, config, rank):
|
||||
super(WaveNetModule, self).__init__(name_scope)
|
||||
|
||||
|
||||
self.rank = rank
|
||||
self.conditioner = Conditioner(self.full_name(), config)
|
||||
self.dilations = list(
|
||||
|
@ -82,15 +96,13 @@ class WaveNetModule(dg.Layer):
|
|||
embed_dim=config.residual_channels,
|
||||
std=0.1)
|
||||
elif config.loss_type == "mix-gaussian-pdf":
|
||||
self.embedding_fc = modules.FC(
|
||||
self.full_name(),
|
||||
in_features=1,
|
||||
size=config.residual_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=False)
|
||||
self.embedding_fc = modules.FC(self.full_name(),
|
||||
in_features=1,
|
||||
size=config.residual_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=False)
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(loss_type))
|
||||
|
||||
self.dilated_causal_convs = []
|
||||
for dilation in self.dilations:
|
||||
|
@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer):
|
|||
num_filters=config.residual_channels,
|
||||
filter_size=config.kernel_width,
|
||||
dilation=dilation,
|
||||
causal=True
|
||||
)
|
||||
)
|
||||
causal=True))
|
||||
|
||||
for i, layer in enumerate(self.dilated_causal_convs):
|
||||
self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
|
||||
self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
|
||||
|
||||
self.fc1 = modules.FC(
|
||||
self.full_name(),
|
||||
in_features=config.residual_channels,
|
||||
size=config.skip_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=True,
|
||||
act="relu")
|
||||
self.fc1 = modules.FC(self.full_name(),
|
||||
in_features=config.residual_channels,
|
||||
size=config.skip_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=True,
|
||||
act="relu")
|
||||
|
||||
self.fc2 = modules.FC(
|
||||
self.full_name(),
|
||||
in_features=config.skip_channels,
|
||||
size=config.skip_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=True,
|
||||
act="relu")
|
||||
self.fc2 = modules.FC(self.full_name(),
|
||||
in_features=config.skip_channels,
|
||||
size=config.skip_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=True,
|
||||
act="relu")
|
||||
|
||||
if config.loss_type == "softmax":
|
||||
self.fc3 = modules.FC(
|
||||
self.full_name(),
|
||||
in_features=config.skip_channels,
|
||||
size=config.num_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=False)
|
||||
self.fc3 = modules.FC(self.full_name(),
|
||||
in_features=config.skip_channels,
|
||||
size=config.num_channels,
|
||||
num_flatten_dims=2,
|
||||
relu=False)
|
||||
elif config.loss_type == "mix-gaussian-pdf":
|
||||
self.fc3 = modules.FC(
|
||||
self.full_name(),
|
||||
in_features=config.skip_channels,
|
||||
size=3 * config.num_mixtures,
|
||||
num_flatten_dims=2,
|
||||
relu=False)
|
||||
self.fc3 = modules.FC(self.full_name(),
|
||||
in_features=config.skip_channels,
|
||||
size=3 * config.num_mixtures,
|
||||
num_flatten_dims=2,
|
||||
relu=False)
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(loss_type))
|
||||
|
||||
def sample_softmax(self, mix_parameters):
|
||||
batch, length, hidden = mix_parameters.shape
|
||||
mix_param_2d = fluid.layers.reshape(mix_parameters,
|
||||
[batch * length, hidden])
|
||||
[batch * length, hidden])
|
||||
mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)
|
||||
|
||||
# quantized: [batch * length]
|
||||
quantized = fluid.layers.cast(fluid.layers.sampling_id(mix_param_2d),
|
||||
dtype="float32")
|
||||
quantized = fluid.layers.cast(
|
||||
fluid.layers.sampling_id(mix_param_2d), dtype="float32")
|
||||
samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0
|
||||
|
||||
# samples: [batch * length]
|
||||
|
@ -162,23 +167,23 @@ class WaveNetModule(dg.Layer):
|
|||
# to [bs * len, 3 * num_mixtures].
|
||||
batch, length, hidden = mix_parameters.shape
|
||||
mix_param_2d = fluid.layers.reshape(mix_parameters,
|
||||
[batch * length, hidden])
|
||||
[batch * length, hidden])
|
||||
K = hidden // 3
|
||||
|
||||
# Unpack the parameters of the mixture of gaussian.
|
||||
logits_pi = mix_param_2d[:, 0 : K]
|
||||
mu = mix_param_2d[:, K : 2*K]
|
||||
log_s = mix_param_2d[:, 2*K : 3*K]
|
||||
logits_pi = mix_param_2d[:, 0:K]
|
||||
mu = mix_param_2d[:, K:2 * K]
|
||||
log_s = mix_param_2d[:, 2 * K:3 * K]
|
||||
s = fluid.layers.exp(log_s)
|
||||
|
||||
pi = fluid.layers.softmax(logits_pi, axis=-1)
|
||||
comp_samples = fluid.layers.sampling_id(pi)
|
||||
|
||||
|
||||
row_idx = dg.to_variable(np.arange(batch * length))
|
||||
comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)
|
||||
|
||||
mu_comp = fluid.layers.gather_nd(mu, comp_samples)
|
||||
s_comp = fluid.layers.gather_nd(s, comp_samples)
|
||||
s_comp = fluid.layers.gather_nd(s, comp_samples)
|
||||
|
||||
# N(0, 1) normal sample.
|
||||
u = fluid.layers.gaussian_random(shape=[batch * length])
|
||||
|
@ -220,8 +225,9 @@ class WaveNetModule(dg.Layer):
|
|||
|
||||
# Calculate gaussian loss.
|
||||
targets = fluid.layers.unsqueeze(targets, -1)
|
||||
targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
|
||||
x_std = inv_s * (targets - mu)
|
||||
targets = fluid.layers.expand(targets,
|
||||
[1, 1, self.config.num_mixtures])
|
||||
x_std = inv_s * (targets - mu)
|
||||
exponent = fluid.layers.exp(-0.5 * x_std * x_std)
|
||||
pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
|
||||
pdf_x = pi * pdf_x
|
||||
|
@ -239,9 +245,9 @@ class WaveNetModule(dg.Layer):
|
|||
|
||||
# Slice conditioners.
|
||||
audio_length = audios.shape[1]
|
||||
conditioner = extract_slices(full_conditioner,
|
||||
audio_starts, audio_length, self.rank)
|
||||
|
||||
conditioner = extract_slices(full_conditioner, audio_starts,
|
||||
audio_length, self.rank)
|
||||
|
||||
# input_audio, target_audio: [bs, len]
|
||||
input_audios = audios[:, :-1]
|
||||
target_audios = audios[:, 1:]
|
||||
|
@ -263,15 +269,16 @@ class WaveNetModule(dg.Layer):
|
|||
layer_input = self.embedding_fc(
|
||||
fluid.layers.unsqueeze(input_audios, 2))
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(loss_type))
|
||||
|
||||
# layer_input: [bs, res_channel, 1, len]
|
||||
layer_input = fluid.layers.unsqueeze(
|
||||
fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
|
||||
fluid.layers.transpose(
|
||||
layer_input, perm=[0, 2, 1]), 2)
|
||||
# conditioner: [bs, mel_bands, 1, len]
|
||||
conditioner = fluid.layers.unsqueeze(
|
||||
fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
|
||||
fluid.layers.transpose(
|
||||
conditioner, perm=[0, 2, 1]), 2)
|
||||
|
||||
skip = None
|
||||
for i, layer in enumerate(self.dilated_causal_convs):
|
||||
|
@ -292,23 +299,22 @@ class WaveNetModule(dg.Layer):
|
|||
elif loss_type == "mix-gaussian-pdf":
|
||||
sample_audios = self.sample_mix_gaussian(mix_parameters)
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(
|
||||
loss_type))
|
||||
|
||||
if loss_type == "softmax":
|
||||
loss = self.softmax_loss(target_audios, mix_parameters)
|
||||
elif loss_type == "mix-gaussian-pdf":
|
||||
loss = self.mixture_density_loss(target_audios,
|
||||
mix_parameters, self.log_scale_min)
|
||||
loss = self.mixture_density_loss(target_audios, mix_parameters,
|
||||
self.log_scale_min)
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(loss_type))
|
||||
|
||||
return loss, sample_audios
|
||||
|
||||
def synthesize(self, mels):
|
||||
self.start_new_sequence()
|
||||
bs, n_frames, mel_bands = mels.shape
|
||||
bs, n_frames, mel_bands = mels.shape
|
||||
conditioner = self.conditioner(mels)
|
||||
time_steps = conditioner.shape[1]
|
||||
|
||||
|
@ -335,23 +341,24 @@ class WaveNetModule(dg.Layer):
|
|||
elif loss_type == "mix-gaussian-pdf":
|
||||
audio_input = self.embedding_fc(current_sample)
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(
|
||||
loss_type))
|
||||
|
||||
# [bs, channel, 1, 1]
|
||||
audio_input = fluid.layers.unsqueeze(
|
||||
fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
|
||||
fluid.layers.transpose(
|
||||
audio_input, perm=[0, 2, 1]), 2)
|
||||
# [bs, mel_bands]
|
||||
cond_input = conditioner[:, i, :]
|
||||
# [bs, mel_bands, 1, 1]
|
||||
cond_input = fluid.layers.reshape(
|
||||
cond_input, cond_input.shape + [1, 1])
|
||||
cond_input = fluid.layers.reshape(cond_input,
|
||||
cond_input.shape + [1, 1])
|
||||
|
||||
skip = None
|
||||
for layer in self.dilated_causal_convs:
|
||||
audio_input, skip = layer.add_input(
|
||||
audio_input, skip, cond_input)
|
||||
|
||||
audio_input, skip = layer.add_input(audio_input, skip,
|
||||
cond_input)
|
||||
|
||||
# [bs, 1, channel]
|
||||
skip = fluid.layers.transpose(
|
||||
fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
|
||||
|
@ -361,19 +368,19 @@ class WaveNetModule(dg.Layer):
|
|||
elif loss_type == "mix-gaussian-pdf":
|
||||
sample = self.sample_mix_gaussian(mix_parameters)
|
||||
else:
|
||||
raise ValueError(
|
||||
"loss_type {} is unsupported!".format(loss_type))
|
||||
raise ValueError("loss_type {} is unsupported!".format(
|
||||
loss_type))
|
||||
audio_samples.append(sample)
|
||||
# [bs]
|
||||
current_sample = audio_samples[-1]
|
||||
# [bs, 1, 1]
|
||||
current_sample = fluid.layers.reshape(current_sample,
|
||||
current_sample.shape + [1, 1])
|
||||
current_sample = fluid.layers.reshape(
|
||||
current_sample, current_sample.shape + [1, 1])
|
||||
|
||||
# syn_audio: [num_samples]
|
||||
syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
|
||||
|
||||
return syn_audio
|
||||
return syn_audio
|
||||
|
||||
def start_new_sequence(self):
|
||||
for layer in self.sublayers():
|
||||
|
|
|
@ -1,2 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from . import weight_norm
|
||||
from .customized import *
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from paddle import fluid
|
||||
import paddle.fluid.layers as F
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
@ -7,14 +21,15 @@ class Pool1D(dg.Layer):
|
|||
"""
|
||||
A Pool 1D block implemented with Pool2D.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
pool_size=-1,
|
||||
pool_type='max',
|
||||
pool_stride=1,
|
||||
pool_padding=0,
|
||||
global_pooling=False,
|
||||
use_cudnn=True,
|
||||
ceil_mode=False,
|
||||
pool_size=-1,
|
||||
pool_type='max',
|
||||
pool_stride=1,
|
||||
pool_padding=0,
|
||||
global_pooling=False,
|
||||
use_cudnn=True,
|
||||
ceil_mode=False,
|
||||
exclusive=True,
|
||||
data_format='NCT'):
|
||||
super(Pool1D, self).__init__()
|
||||
|
@ -28,13 +43,16 @@ class Pool1D(dg.Layer):
|
|||
self.exclusive = exclusive
|
||||
self.data_format = data_format
|
||||
|
||||
self.pool2d = dg.Pool2D(
|
||||
[1, pool_size],
|
||||
pool_type=pool_type,
|
||||
pool_stride=[1, pool_stride],
|
||||
pool_padding=[0, pool_padding],
|
||||
global_pooling=global_pooling,
|
||||
use_cudnn=use_cudnn,
|
||||
ceil_mode=ceil_mode,
|
||||
exclusive=exclusive)
|
||||
|
||||
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
|
||||
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
|
||||
global_pooling = global_pooling, use_cudnn = use_cudnn,
|
||||
ceil_mode = ceil_mode, exclusive = exclusive)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Args:
|
||||
|
@ -53,12 +71,14 @@ class Pool1D(dg.Layer):
|
|||
x = fluid.layers.transpose(x, [0, 2, 1])
|
||||
return x
|
||||
|
||||
|
||||
class Conv1D(dg.Conv2D):
|
||||
"""A standard Conv1D layer that uses (B, C, T) data layout. It inherits Conv2D and
|
||||
uses (B, C, 1, T) data layout to compute 1D convolution. Nothing more.
|
||||
NOTE: we inherit Conv2D instead of encapsulating a Conv2D layer to make it a simple
|
||||
layer, instead of a complex one. So we can easily apply weight norm to it.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_channels,
|
||||
num_filters,
|
||||
|
@ -72,17 +92,18 @@ class Conv1D(dg.Conv2D):
|
|||
use_cudnn=True,
|
||||
act=None,
|
||||
dtype='float32'):
|
||||
super(Conv1D, self).__init__(num_channels,
|
||||
num_filters, (1, filter_size),
|
||||
stride=(1, stride),
|
||||
padding=(0, padding),
|
||||
dilation=(1, dilation),
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
super(Conv1D, self).__init__(
|
||||
num_channels,
|
||||
num_filters, (1, filter_size),
|
||||
stride=(1, stride),
|
||||
padding=(0, padding),
|
||||
dilation=(1, dilation),
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
x = F.unsqueeze(x, [2])
|
||||
|
@ -105,18 +126,19 @@ class Conv1DTranspose(dg.Conv2DTranspose):
|
|||
use_cudnn=True,
|
||||
act=None,
|
||||
dtype='float32'):
|
||||
super(Conv1DTranspose, self).__init__(num_channels,
|
||||
num_filters, (1, filter_size),
|
||||
output_size=None,
|
||||
padding=(0, padding),
|
||||
stride=(1, stride),
|
||||
dilation=(1, dilation),
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
super(Conv1DTranspose, self).__init__(
|
||||
num_channels,
|
||||
num_filters, (1, filter_size),
|
||||
output_size=None,
|
||||
padding=(0, padding),
|
||||
stride=(1, stride),
|
||||
dilation=(1, dilation),
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
x = F.unsqueeze(x, [2])
|
||||
|
@ -134,6 +156,7 @@ class Conv1DCell(Conv1D):
|
|||
It is a cell that acts like an RNN cell. It does not support stride > 1, and it
|
||||
ensures 1-to-1 mapping from input time steps to output timesteps.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_channels,
|
||||
num_filters,
|
||||
|
@ -150,18 +173,19 @@ class Conv1DCell(Conv1D):
|
|||
padding = receptive_field - 1 if causal else receptive_field // 2
|
||||
self._receptive_field = receptive_field
|
||||
self.causal = causal
|
||||
super(Conv1DCell, self).__init__(num_channels,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
super(Conv1DCell, self).__init__(
|
||||
num_channels,
|
||||
num_filters,
|
||||
filter_size,
|
||||
stride=1,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
param_attr=param_attr,
|
||||
bias_attr=bias_attr,
|
||||
use_cudnn=use_cudnn,
|
||||
act=act,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
# it ensures that output time steps == input time steps
|
||||
|
@ -189,15 +213,16 @@ class Conv1DCell(Conv1D):
|
|||
def add_input(self, x_t):
|
||||
batch_size, c_in, _ = x_t.shape
|
||||
if self._buffer is None:
|
||||
self._buffer = F.zeros((batch_size, c_in, self.receptive_field),
|
||||
dtype=x_t.dtype)
|
||||
self._buffer = F.zeros(
|
||||
(batch_size, c_in, self.receptive_field), dtype=x_t.dtype)
|
||||
self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1)
|
||||
if self._dilation[1] > 1:
|
||||
input = F.strided_slice(self._buffer,
|
||||
axes=[2],
|
||||
starts=[0],
|
||||
ends=[self.receptive_field],
|
||||
strides=[self._dilation[1]])
|
||||
input = F.strided_slice(
|
||||
self._buffer,
|
||||
axes=[2],
|
||||
starts=[0],
|
||||
ends=[self.receptive_field],
|
||||
strides=[self._dilation[1]])
|
||||
else:
|
||||
input = self._buffer
|
||||
input = F.reshape(input, (batch_size, -1))
|
||||
|
|
|
@ -1,6 +1,20 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
|
||||
class DynamicGRU(dg.Layer):
|
||||
def __init__(self,
|
||||
size,
|
||||
|
@ -49,4 +63,3 @@ class DynamicGRU(dg.Layer):
|
|||
res = res[::-1]
|
||||
res = layers.concat(res, axis=1)
|
||||
return res
|
||||
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
|
@ -7,28 +20,41 @@ from parakeet.modules.customized import Conv1D
|
|||
|
||||
class PositionwiseFeedForward(dg.Layer):
|
||||
''' A two-feed-forward-layer module '''
|
||||
def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
|
||||
|
||||
def __init__(self,
|
||||
d_in,
|
||||
num_hidden,
|
||||
filter_size,
|
||||
padding=0,
|
||||
use_cudnn=True,
|
||||
dropout=0.1):
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
self.num_hidden = num_hidden
|
||||
self.use_cudnn = use_cudnn
|
||||
self.dropout = dropout
|
||||
|
||||
k = math.sqrt(1 / d_in)
|
||||
self.w_1 = Conv1D(num_channels = d_in,
|
||||
num_filters = num_hidden,
|
||||
filter_size = filter_size,
|
||||
padding=padding,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn)
|
||||
self.w_1 = Conv1D(
|
||||
num_channels=d_in,
|
||||
num_filters=num_hidden,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn)
|
||||
k = math.sqrt(1 / num_hidden)
|
||||
self.w_2 = Conv1D(num_channels = num_hidden,
|
||||
num_filters = d_in,
|
||||
filter_size = filter_size,
|
||||
padding=padding,
|
||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||
use_cudnn = use_cudnn)
|
||||
self.w_2 = Conv1D(
|
||||
num_channels=num_hidden,
|
||||
num_filters=d_in,
|
||||
filter_size=filter_size,
|
||||
padding=padding,
|
||||
param_attr=fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer()),
|
||||
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k)),
|
||||
use_cudnn=use_cudnn)
|
||||
self.layer_norm = dg.LayerNorm(d_in)
|
||||
|
||||
def forward(self, input):
|
||||
|
@ -40,18 +66,18 @@ class PositionwiseFeedForward(dg.Layer):
|
|||
Returns:
|
||||
output (Variable), Shape(B, T, C), the result after FFN.
|
||||
"""
|
||||
x = layers.transpose(input, [0,2,1])
|
||||
x = layers.transpose(input, [0, 2, 1])
|
||||
# FFN Network
|
||||
x = self.w_2(layers.relu(self.w_1(x)))
|
||||
|
||||
|
||||
# dropout
|
||||
x = layers.dropout(x, self.dropout)
|
||||
|
||||
x = layers.transpose(x, [0,2,1])
|
||||
x = layers.transpose(x, [0, 2, 1])
|
||||
# residual connection
|
||||
x = x + input
|
||||
|
||||
|
||||
#layer normalization
|
||||
output = self.layer_norm(x)
|
||||
|
||||
return output
|
||||
return output
|
||||
|
|
|
@ -1,37 +1,67 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
import numpy as np
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
|
||||
class Linear(dg.Layer):
|
||||
def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
|
||||
def __init__(self,
|
||||
in_features,
|
||||
out_features,
|
||||
is_bias=True,
|
||||
dtype="float32"):
|
||||
super(Linear, self).__init__()
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
self.dtype = dtype
|
||||
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
|
||||
self.bias = is_bias
|
||||
self.weight = fluid.ParamAttr(
|
||||
initializer=fluid.initializer.XavierInitializer())
|
||||
self.bias = is_bias
|
||||
|
||||
if is_bias is not False:
|
||||
k = math.sqrt(1 / in_features)
|
||||
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
|
||||
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
|
||||
low=-k, high=k))
|
||||
|
||||
self.linear = dg.Linear(
|
||||
in_features,
|
||||
out_features,
|
||||
param_attr=self.weight,
|
||||
bias_attr=self.bias, )
|
||||
|
||||
self.linear = dg.Linear(in_features, out_features, param_attr = self.weight,
|
||||
bias_attr = self.bias,)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.linear(x)
|
||||
return x
|
||||
|
||||
|
||||
class ScaledDotProductAttention(dg.Layer):
|
||||
def __init__(self, d_key):
|
||||
super(ScaledDotProductAttention, self).__init__()
|
||||
|
||||
self.d_key = d_key
|
||||
|
||||
|
||||
# Note: this mask is different from the one used in PyTorch
|
||||
def forward(self, key, value, query, mask=None, query_mask=None, dropout=0.1):
|
||||
def forward(self,
|
||||
key,
|
||||
value,
|
||||
query,
|
||||
mask=None,
|
||||
query_mask=None,
|
||||
dropout=0.1):
|
||||
"""
|
||||
Scaled Dot Product Attention.
|
||||
|
||||
|
@ -47,27 +77,36 @@ class ScaledDotProductAttention(dg.Layer):
|
|||
attention (Variable), Shape(n_head * B, T, C), the attention of key.
|
||||
"""
|
||||
# Compute attention score
|
||||
attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y
|
||||
attention = layers.matmul(
|
||||
query, key, transpose_y=True) #transpose the last dim in y
|
||||
attention = attention / math.sqrt(self.d_key)
|
||||
|
||||
# Mask key to ignore padding
|
||||
if mask is not None:
|
||||
attention = attention * mask
|
||||
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
|
||||
mask = (mask == 0).astype(np.float32) * (-2**32 + 1)
|
||||
attention = attention + mask
|
||||
|
||||
|
||||
attention = layers.softmax(attention)
|
||||
attention = layers.dropout(attention, dropout)
|
||||
|
||||
|
||||
# Mask query to ignore padding
|
||||
if query_mask is not None:
|
||||
attention = attention * query_mask
|
||||
|
||||
|
||||
result = layers.matmul(attention, value)
|
||||
return result, attention
|
||||
|
||||
|
||||
class MultiheadAttention(dg.Layer):
|
||||
def __init__(self, num_hidden, d_k, d_q, num_head=4, is_bias=False, dropout=0.1, is_concat=True):
|
||||
def __init__(self,
|
||||
num_hidden,
|
||||
d_k,
|
||||
d_q,
|
||||
num_head=4,
|
||||
is_bias=False,
|
||||
dropout=0.1,
|
||||
is_concat=True):
|
||||
super(MultiheadAttention, self).__init__()
|
||||
self.num_hidden = num_hidden
|
||||
self.num_head = num_head
|
||||
|
@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer):
|
|||
|
||||
# repeat masks h times
|
||||
if query_mask is not None:
|
||||
query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
|
||||
query_mask = layers.expand(query_mask,
|
||||
[self.num_head, 1, seq_len_key])
|
||||
if mask is not None:
|
||||
mask = layers.expand(mask, (self.num_head, 1, 1))
|
||||
|
||||
|
||||
|
||||
# Make multihead attention
|
||||
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
|
||||
key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
|
||||
value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k])
|
||||
query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q])
|
||||
key = layers.reshape(
|
||||
self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
|
||||
value = layers.reshape(
|
||||
self.value(value),
|
||||
[batch_size, seq_len_key, self.num_head, self.d_k])
|
||||
query = layers.reshape(
|
||||
self.query(query_input),
|
||||
[batch_size, seq_len_query, self.num_head, self.d_q])
|
||||
|
||||
key = layers.reshape(
|
||||
layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
|
||||
value = layers.reshape(
|
||||
layers.transpose(value, [2, 0, 1, 3]),
|
||||
[-1, seq_len_key, self.d_k])
|
||||
query = layers.reshape(
|
||||
layers.transpose(query, [2, 0, 1, 3]),
|
||||
[-1, seq_len_query, self.d_q])
|
||||
|
||||
result, attention = self.scal_attn(
|
||||
key, value, query, mask=mask, query_mask=query_mask)
|
||||
|
||||
key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
|
||||
value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
|
||||
query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q])
|
||||
|
||||
result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
|
||||
|
||||
# concat all multihead result
|
||||
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
|
||||
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
|
||||
result = layers.reshape(
|
||||
result, [self.num_head, batch_size, seq_len_query, self.d_q])
|
||||
result = layers.reshape(
|
||||
layers.transpose(result, [1, 2, 0, 3]),
|
||||
[batch_size, seq_len_query, -1])
|
||||
if self.is_concat:
|
||||
result = layers.concat([query_input,result], axis=-1)
|
||||
result = layers.concat([query_input, result], axis=-1)
|
||||
result = layers.dropout(self.fc(result), self.dropout)
|
||||
result = result + query_input
|
||||
|
||||
|
||||
result = self.layer_norm(result)
|
||||
return result, attention
|
||||
return result, attention
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
from torch import nn
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
@ -10,8 +24,8 @@ def summary(layer):
|
|||
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
|
||||
num_elements += np.prod(param.shape)
|
||||
num_params += 1
|
||||
print("layer has {} parameters, {} elements.".format(
|
||||
num_params, num_elements))
|
||||
print("layer has {} parameters, {} elements.".format(num_params,
|
||||
num_elements))
|
||||
|
||||
|
||||
def freeze(layer):
|
||||
|
@ -31,5 +45,5 @@ def torch_summary(layer):
|
|||
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
|
||||
num_elements += np.prod(param.shape)
|
||||
num_params += 1
|
||||
print("layer has {} parameters, {} elements.".format(
|
||||
num_params, num_elements))
|
||||
print("layer has {} parameters, {} elements.".format(num_params,
|
||||
num_elements))
|
||||
|
|
50
setup.py
50
setup.py
|
@ -1,13 +1,27 @@
|
|||
import os
|
||||
import io
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
|
||||
def read(*names, **kwargs):
|
||||
with io.open(
|
||||
os.path.join(os.path.dirname(__file__), *names),
|
||||
encoding=kwargs.get("encoding", "utf8")
|
||||
) as fp:
|
||||
os.path.join(os.path.dirname(__file__), *names),
|
||||
encoding=kwargs.get("encoding", "utf8")) as fp:
|
||||
return fp.read()
|
||||
|
||||
|
||||
|
@ -19,6 +33,7 @@ def find_version(*file_paths):
|
|||
return version_match.group(1)
|
||||
raise RuntimeError("Unable to find version string.")
|
||||
|
||||
|
||||
VERSION = find_version('parakeet', '__init__.py')
|
||||
long_description = read('README.md')
|
||||
|
||||
|
@ -32,17 +47,26 @@ setup_info = dict(
|
|||
description='Speech synthesis tools and models based on Paddlepaddle',
|
||||
long_description=long_description,
|
||||
license='Apache 2',
|
||||
|
||||
install_requires=[
|
||||
'numpy', 'nltk', 'inflect', 'librosa', 'unidecode', 'numba',
|
||||
'tqdm', 'matplotlib', 'tensorboardX', 'tensorboard', 'scipy',
|
||||
'ruamel.yaml', 'pandas', 'sox', 'soundfile',
|
||||
'numpy',
|
||||
'nltk',
|
||||
'inflect',
|
||||
'librosa',
|
||||
'unidecode',
|
||||
'numba',
|
||||
'tqdm',
|
||||
'matplotlib',
|
||||
'tensorboardX',
|
||||
'tensorboard',
|
||||
'scipy',
|
||||
'ruamel.yaml',
|
||||
'pandas',
|
||||
'sox',
|
||||
'soundfile',
|
||||
],
|
||||
|
||||
# Package info
|
||||
packages=find_packages(exclude=('tests', 'tests.*')),
|
||||
zip_safe=True, )
|
||||
|
||||
zip_safe=True,
|
||||
)
|
||||
|
||||
setup(**setup_info)
|
||||
setup(**setup_info)
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.datasets.ljspeech import LJSpeech
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
|
||||
|
|
|
@ -1,11 +1,25 @@
|
|||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from parakeet.datasets import vctk
|
||||
from pathlib import Path
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
|
||||
root = Path("/workspace/datasets/VCTK-Corpus")
|
||||
vctk_dataset = vctk.VCTK(root)
|
||||
vctk_cargo = DataCargo(vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
|
||||
vctk_cargo = DataCargo(
|
||||
vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
|
||||
|
||||
for i, batch in enumerate(vctk_cargo):
|
||||
print(i)
|
||||
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import datetime
import io, re
import sys, os
import subprocess
import platform
|
||||
|
||||
COPYRIGHT = '''
|
||||
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
# Module-level placeholder; generate_copyright chooses the comment marker
# per language on each call.
LANG_COMMENT_MARK = None

# The COPYRIGHT literal and every file read through io.open in text mode use
# "\n" as the in-memory line separator on all platforms (text-mode I/O
# translates to and from the OS convention). Splitting on "\r\n" on Windows
# therefore never matched and made the [1] lookup below raise IndexError.
NEW_LINE_MARK = "\n"

# COPYRIGHT starts with a newline, so index 0 is empty and index 1 is the
# "Copyright (c) <year> ..." line.
COPYRIGHT_HEADER = COPYRIGHT.splitlines()[1]

# Swap the four-digit year written in the template for the current local
# year. datetime replaces the former call to the external `date` program,
# which is not available as an executable on Windows.
p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)
date = "{}".format(datetime.date.today().year)
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
|
||||
|
||||
|
||||
def generate_copyright(template, lang='C'):
    """Render the license template as a block of line comments.

    The first output line is always the module-level ``COPYRIGHT_HEADER``
    (the template's own header line is skipped in its favour). Every
    following template line is prefixed with the comment marker; non-empty
    lines also get a single separating space, empty lines get the bare
    marker so no trailing whitespace is produced.

    Args:
        template: License text. It is expected to start with a newline
            followed by the header line, as the COPYRIGHT constant does;
            those first two lines are dropped.
        lang: 'Python' selects the '#' marker; any other value selects '//'.

    Returns:
        The commented license text followed by one blank line.
    """
    comment_mark = '#' if lang == 'Python' else "//"

    # splitlines() copes with any line-ending convention and does not yield
    # a trailing empty entry, so only the leading empty line and the header
    # line need to be skipped.
    body_lines = template.splitlines()[2:]

    rendered = [comment_mark + " " + COPYRIGHT_HEADER]
    for line in body_lines:
        rendered.append((comment_mark + " " + line) if line else comment_mark)

    # "\n" is used unconditionally: the result is written through a
    # text-mode file object, which applies the platform line ending itself.
    return "\n".join(rendered) + "\n\n"
|
||||
|
||||
|
||||
def lang_type(filename):
    """Map a file name to the comment style used for its license header.

    Args:
        filename: Path or name of the file; only its suffix is inspected.

    Returns:
        "Python" for ``.py`` files, "C" for every supported language that
        uses ``//`` line comments.

    Raises:
        SystemExit: With status 0 when the suffix is not supported, after
            printing a message. The zero status is kept from the original
            so an unsupported file does not fail the hook.
    """
    # .cxx and .hxx are included because the pre-commit `files` pattern
    # passes them to this hook.
    c_style_suffixes = (".h", ".c", ".hpp", ".hxx", ".cc", ".cpp", ".cxx",
                        ".cu", ".cuh", ".go", ".proto")

    if filename.endswith(".py"):
        return "Python"
    if filename.endswith(c_style_suffixes):
        return "C"

    print("Unsupported filetype %s" % filename)
    sys.exit(0)
|
||||
|
||||
|
||||
# Matches a source-encoding declaration comment such as
# "# -*- coding: utf-8 -*-": optional leading whitespace, a '#', then
# "coding" followed by ':' or '=' and the encoding name (captured in group 1).
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
|
||||
|
||||
|
||||
def main(argv=None):
    """Insert the license header into every given file that lacks one.

    A file is left untouched when its first line already contains
    "copyright (c)" (case-insensitive), when its first line is a shebang,
    or when either of its first two lines is an encoding declaration.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.

    Returns:
        1 if at least one file was rewritten, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        # Open the file once and let the context manager close it on every
        # path, including the early `continue`s.
        with io.open(filename, encoding="utf-8") as fd:
            first_line = fd.readline()
            second_line = fd.readline()
            if "COPYRIGHT (C)" in first_line.upper():
                continue
            if (first_line.startswith("#!") or
                    PYTHON_ENCODE.match(second_line) is not None or
                    PYTHON_ENCODE.match(first_line) is not None):
                continue
            original_contents = first_line + second_line + fd.read()

        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        # Write with the same encoding used for reading so non-ASCII
        # content survives regardless of the platform default.
        with io.open(filename, 'w', encoding="utf-8") as output_file:
            output_file.write(new_contents)

    return retv
|
||||
|
||||
|
||||
# Run the checker when executed as a script and report its result as the
# process exit status. sys.exit is used because the bare `exit` name is
# injected by the site module and is not guaranteed to exist.
if __name__ == '__main__':
    sys.exit(main())
|
Loading…
Reference in New Issue