Compare commits

...

101 Commits

Author SHA1 Message Date
Hui Zhang c272a843e9
Merge pull request #118 from PaddlePaddle/revert-116-fastspeech
Revert "Add models/fastspeech2"
2021-06-16 14:22:00 +08:00
Hui Zhang ffcafb9b18
Revert "Add models/fastspeech2" 2021-06-16 14:21:24 +08:00
Hui Zhang 8224983d10
Merge pull request #116 from iclementine/fastspeech
Add models/fastspeech2
2021-06-16 14:20:30 +08:00
Feiyu Chan 106e8cf770
Merge pull request #112 from iclementine/release/v0.3
bump version to 0.3.1
2021-05-20 11:09:19 +08:00
chenfeiyu 276df568e2 bump version to 0.3.1 2021-05-20 10:27:01 +08:00
chenfeiyu 88e97a5963 Merge branch 'release/v0.3' of https://github.com/PaddlePaddle/Parakeet into release/v0.3 2021-05-18 18:56:07 +08:00
Feiyu Chan d726863138
fix a config key error (#110) 2021-05-18 18:13:36 +08:00
chenfeiyu 8caefd0094 fix a config key error 2021-05-18 17:53:09 +08:00
chenfeiyu fa6ddf5b0c bump version string to 0.3.0 2021-05-17 11:33:39 +08:00
chenfeiyu c02adfdad8 Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into develop 2021-05-17 11:29:31 +08:00
chenfeiyu e1a7c296fe simplify text processing code and update notebook 2021-05-13 17:06:34 +08:00
chenfeiyu 6a1fb158d9 format code with pre-commit 2021-05-13 16:22:56 +08:00
chenfeiyu 73ca693395 add praatio into requirements for running the experiments 2021-05-11 22:46:09 +08:00
chenfeiyu 2f644e1b8b refine READMEs and clean code 2021-05-11 22:44:02 +08:00
chenfeiyu 8bcbcf8a86 add links to download pretrained models 2021-05-07 16:49:11 +08:00
chenfeiyu 71a87559da update README 2021-05-07 16:28:23 +08:00
chenfeiyu 664fc20c0a update doc 2021-05-07 16:16:58 +08:00
chenfeiyu b9aa61b5eb update docstrings for tacotron 2021-05-07 16:08:31 +08:00
chenfeiyu f197e4d04f update README and doc 2021-05-07 15:35:47 +08:00
chenfeiyu ef1ea56ed6 fix typos and docs 2021-05-07 15:03:54 +08:00
chenfeiyu 38831bf8b6 add extra_config keys into the default config of tacotron 2021-04-30 14:27:08 +08:00
chenfeiyu b88a0f90aa add STFT back 2021-04-29 17:54:07 +08:00
iclementine 42092f1f5b update README for examples/ge2e 2021-04-29 17:15:18 +08:00
iclementine b1304cb449 add images for examples/tacotron2_aishell3's README 2021-04-29 17:09:40 +08:00
iclementine cab12c2dfd update tacotron_aishell3's README 2021-04-29 17:00:26 +08:00
iclementine ba7639b994 update tacotron2 2021-04-29 16:43:03 +08:00
iclementine 123bbe994f update tacotron2 from_pretrained, update setup.py 2021-04-29 16:04:32 +08:00
iclementine 701376f401 remove tacotron2_msp 2021-04-28 20:05:12 +08:00
iclementine 77eb13d95d format code 2021-04-28 20:02:29 +08:00
chenfeiyu cbe531158e add plot_multiple_attentions and update visualization code in transformer_tts 2021-04-27 17:40:50 +08:00
chenfeiyu 263d3eb88b add an option to alter the loss and model structure of tacotron2, add an alternative config 2021-04-26 21:18:29 +08:00
chenfeiyu 4fc86abf5a Merge branch 'baker' of https://github.com/iclementine/Parakeet into baker 2021-04-25 11:11:36 +08:00
chenfeiyu 85649725fb add voice cloning notebook 2021-04-25 11:11:24 +08:00
iclementine cf01a0da22 add more details to the README, fix some preprocess scripts 2021-04-25 11:00:42 +08:00
iclementine 4426417da1 WIP: add README 2021-04-22 17:40:36 +08:00
iclementine e8a9a118bb clean code for data processing 2021-04-22 17:20:34 +08:00
iclementine 56f2552201 fix argument name 2021-04-22 14:50:52 +08:00
chenfeiyu c2560e8aa2 fix argument order 2021-04-22 13:46:51 +08:00
iclementine 3a744dbf30 clean code 2021-04-22 13:25:25 +08:00
iclementine 764c35e99e move tacotron2_msp 2021-04-22 11:00:33 +08:00
chenfeiyu c8627fdd75 remove imports to deleted modules 2021-04-20 20:12:57 +08:00
chenfeiyu 16b4d4eecb remove files not included in this release 2021-04-20 17:12:22 +08:00
chenfeiyu 6b3999217b remove imports of removed modules 2021-04-20 15:54:55 +08:00
iclementine e992e17456 resolve conflict 2021-04-19 20:17:21 +08:00
iclementine 0eea7cc373 fix typos 2021-04-19 20:15:46 +08:00
iclementine f8f3ec4709 Merge branch 'baker' of github.com:iclementine/Parakeet into baker 2021-04-19 20:12:07 +08:00
chenfeiyu 9da118e53b merge wavenet 2021-04-19 20:09:01 +08:00
chenfeiyu 3741cc49ca change wavenet to use on-the-fly preprocessing 2021-04-19 19:58:36 +08:00
iclementine b53b274585 change batch_text_id, batch_spec, batch_wav to include valid lengths in the returned value 2021-04-19 17:06:52 +08:00
iclementine 6749ce40ea add audio datasets 2021-04-19 16:17:30 +08:00
iclementine 49f2c4b3fb change stft to use conv1d 2021-04-16 15:01:10 +08:00
iclementine e06c6cdfe1 WIP:update hifigan 2021-04-15 17:23:42 +08:00
iclementine 68497f89a4 WIP: add hifigan 2021-04-14 20:59:26 +08:00
chenfeiyu e54f23befd update collate function; the data loader now does not convert nested lists into numpy arrays. 2021-04-14 20:51:13 +08:00
chenfeiyu c6965e2c5a fix fmax for example/waveflow 2021-04-14 20:50:38 +08:00
iclementine b674f63d74 add 2 frontends 2021-04-08 04:59:29 +08:00
iclementine 184745f42b add gst layer 2021-04-08 04:59:03 +08:00
iclementine dc3b798f82 add global condition support for tacotron2 2021-04-08 04:58:44 +08:00
chenfeiyu 5011f16c10 minor fix 2021-04-07 10:55:05 +08:00
iclementine 4d3014f4d5 add new trainer 2021-04-03 16:19:46 +08:00
iclementine 27e0201d0d format code for tacotron_vctk, add plot_waveform to display 2021-04-02 15:46:28 +08:00
iclementine a3fae49022 merge refactor_tacotron 2021-04-02 11:48:16 +08:00
iclementine 274d8dce07 update experiment and display 2021-04-02 11:37:48 +08:00
iclementine 15b205d6e0 Merge branch 'develop' into baker 2021-04-02 11:23:21 +08:00
chenfeiyu 8d67066765 add example for baker and aishell3 2021-04-02 11:06:34 +08:00
chenfeiyu 9babec0f98 fix text log extension name 2021-04-01 13:49:52 +08:00
chenfeiyu 752272de98 fix bugs 2021-04-01 13:15:06 +08:00
iclementine e0052ccedf fix typos 2021-03-31 19:38:12 +08:00
iclementine a834e132b9 fix root path 2021-03-31 19:36:48 +08:00
iclementine dd73ee6611 fix root path 2021-03-31 19:35:59 +08:00
iclementine 883bc16d24 fix root path 2021-03-31 19:33:33 +08:00
iclementine 9798d07337 fix visualizer 2021-03-31 19:32:23 +08:00
iclementine f84d460613 fix class name 2021-03-31 19:31:16 +08:00
iclementine 327c7a5ce4 fix indentation 2021-03-31 19:29:09 +08:00
iclementine 4a039b6407 add vctk example for refactored tacotron 2021-03-31 17:34:19 +08:00
iclementine 7cc3e8c340 add a simple strategy to support multispeaker for tacotron. 2021-03-31 15:23:41 +08:00
iclementine 2dd393349f Merge branch 'develop' into refactor_tacotron 2021-03-30 16:01:22 +08:00
iclementine e3f7bb5a51 simplify visualization code 2021-03-30 15:56:14 +08:00
chenfeiyu 0fdb86834b Merge branch 'develop' into baker 2021-03-30 14:39:11 +08:00
chenfeiyu b5dd0cc197 fix speaker encoder and add support for 2 more datasets 2021-03-30 14:38:44 +08:00
iclementine 4757f08550 Merge branch 'develop' into baker 2021-03-29 11:17:51 +08:00
iclementine 59ed247840 fix lstm speaker encoder 2021-03-29 11:17:23 +08:00
iclementine ab85d5ca13 Merge branch 'develop' into baker 2021-03-29 11:13:57 +08:00
iclementine 5443e23fb7 fix lstm speaker encoder 2021-03-29 11:12:02 +08:00
iclementine 6defef4944 Merge branch 'baker' of github.com:iclementine/Parakeet into baker 2021-03-29 10:49:24 +08:00
chenfeiyu 489fb69f55 Merge branch 'develop' into baker 2021-03-29 10:49:34 +08:00
iclementine a9a78742fa Merge branch 'develop' into baker 2021-03-29 10:42:17 +08:00
iclementine 2475da3322 add ge2e 2021-03-27 17:39:37 +08:00
chenfeiyu a005cc88a3 WIP: baker 2021-03-27 12:43:03 +08:00
iclementine 2b62fbb614 1. change the default min value of LogMagnitude to 1e-5;
2. remove stop logit prediction from tacotron2 model.
2021-03-23 10:44:22 +08:00
iclementine da63cfa42e add an embedding layer. 2021-03-22 21:39:22 +08:00
iclementine f9d6160916 add an option to normalize volume when loading audio. 2021-03-22 21:38:28 +08:00
iclementine 086fbf8e35 refactoring code 2021-03-22 21:23:46 +08:00
chenfeiyu 3c60fec900 remove bn in postnet 2021-02-27 03:26:41 +08:00
chenfeiyu 929165b64a 1. remove space from numericalized representation;
2. fix decoder padding mask's unsqueeze dim.
2021-02-27 02:59:38 +08:00
chenfeiyu ae9e218073 use emb add in tacotron2 2021-02-26 18:08:26 +08:00
chenfeiyu 40237c40b0 Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into baker 2021-02-26 11:07:03 +08:00
chenfeiyu 9e4d5a3d8a fix experiments for waveflow and wavenet, only write visual log in rank-0 2021-02-21 17:30:13 +08:00
chenfeiyu 6a92fde9b2 Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into baker 2021-02-18 19:58:27 +08:00
chenfeiyu 25bd8987a6 Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into baker 2021-02-18 19:51:56 +08:00
chenfeiyu 239703be8b hacky thing, add tone support for acoustic model 2021-02-10 22:58:08 +08:00
4 changed files with 1 addition and 927 deletions
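The head of this comparison is revert PR #118, which undoes merge PR #116 ("Add models/fastspeech2"). For reference, a sketch of how such a revert branch is typically produced (assuming 8224983d10, the merge commit of PR #116, is the commit being reverted; -m 1 keeps the mainline parent):

    git checkout -b revert-116-fastspeech
    git revert -m 1 8224983d10
    git push -u origin revert-116-fastspeech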

View File

@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.2.0-beta.0"
__version__ = "0.3.1"
from parakeet import audio, data, datasets, frontend, models, modules, training, utils

View File

@@ -1,712 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddle.fluid.layers import sequence_mask
from parakeet.modules.positioning import position_encoding
from parakeet.modules.attention import (_split_heads, _concat_heads,
scaled_dot_product_attention)
from parakeet.modules import geometry as geo
from parakeet.modules.conv import Conv1dBatchNorm
from typing import Optional
class FastSpeechFeedForwardTransformer(nn.Layer):
def __init__(self,
num_layers,
model_dim,
num_heads,
ffn_dim,
ffn_kernel_size,
attention_dropout=0.,
residual_dropout=0.,
num_speakers=1,
max_position=1000,
input_dim: Optional[int]=None,
epsilon=1e-5,
scheme="post"):
super().__init__()
# optional input layer
input_dim = input_dim or model_dim
self.input_dim = input_dim
self.model_dim = model_dim
if input_dim != model_dim:
self.input_fc = nn.Linear(input_dim, model_dim)
self.pos_embedding = position_encoding(1 + max_position, model_dim)
self.num_speakers = num_speakers
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, model_dim)
self.speaker_fc = nn.Linear(model_dim, model_dim)
self.layers = nn.LayerList([
FastSpeechFFTBlock(model_dim, num_heads, ffn_dim, ffn_kernel_size,
attention_dropout, residual_dropout, epsilon,
scheme) for _ in range(num_layers)
])
def forward(self, x, mask, speaker_ids=None):
"""
x: [B, T, C]
mask: [B, 1, T] or [B, T, T]
returns: [B, T, C]
"""
if self.input_dim != self.model_dim:
x = self.input_fc(x)
batch_size, time_steps, _ = x.shape
pos_embed = self.pos_embedding[1:1 + time_steps, :]
x += pos_embed
if self.num_speakers > 1:
speaker_embedding = self.speaker_embedding(speaker_ids)
speaker_feature = F.softplus(self.speaker_fc(speaker_embedding))
            speaker_feature = paddle.unsqueeze(speaker_feature, 1)  # [B, 1, C]
x += speaker_feature
for layer in self.layers:
x, attn = layer(x, mask)
# we do not return attention here
return x
class MultiheadAttention(nn.Layer):
def __init__(self,
model_dim: int,
num_heads: int,
k_input_dim: Optional[int]=None,
v_input_dim: Optional[int]=None,
dropout: float=0.):
super().__init__()
if model_dim % num_heads != 0:
raise ValueError("model_dim must be divisible by num_heads")
depth = model_dim // num_heads
k_input_dim = k_input_dim or model_dim
v_input_dim = v_input_dim or model_dim
self.wq = nn.Linear(model_dim, model_dim)
self.wk = nn.Linear(k_input_dim, model_dim)
self.wv = nn.Linear(v_input_dim, model_dim)
self.wo = nn.Linear(model_dim, model_dim)
self.num_heads = num_heads
self.model_dim = model_dim
self.dropout = dropout
def forward(self, q, k, v, mask=None):
q = _split_heads(self.wq(q), self.num_heads) # (B, h, T, C)
k = _split_heads(self.wk(k), self.num_heads)
v = _split_heads(self.wv(v), self.num_heads)
if mask is not None:
mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim
context_vectors, attention_weights = scaled_dot_product_attention(
q, k, v, mask, dropout=self.dropout, training=self.training)
context_vectors = _concat_heads(context_vectors)
context_vectors = self.wo(context_vectors)
return context_vectors, attention_weights
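# Shape walkthrough for MultiheadAttention.forward (annotation, not original
# code; B=batch, T=time, h=num_heads, d=model_dim//num_heads):
#   self.wq(q): [B, T, model_dim] --_split_heads--> q: [B, h, T, d]
#   mask: [B, 1, T] or [B, T, T] --unsqueeze(1)--> broadcasts over the h dim
#   scaled_dot_product_attention -> context: [B, h, T, d], weights: [B, h, T, T]
#   _concat_heads(context): [B, T, model_dim] --wo--> [B, T, model_dim]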
class FastSpeechSelfAttentionNorm(nn.Layer):
"""Self attention & Layer normalization, both schemes are supported."""
def __init__(self,
model_dim,
num_heads,
attention_dropout=0.,
residual_dropout=0.,
epsilon=1e-5,
scheme="post"):
super().__init__()
if scheme not in ["post", "pre"]:
raise ValueError("scheme should be 'pre' or 'post'")
self.scheme = scheme
self.attention = MultiheadAttention(
model_dim, num_heads, dropout=attention_dropout)
self.layer_norm = nn.LayerNorm([model_dim], epsilon=epsilon)
self.dropout_layer = nn.Dropout(residual_dropout)
def forward(self, x, mask=None):
# [B, T, C], [B, 1, T] -> [B, T, C], [B, T, T]
        if self.scheme == "post":
c, w = self.attention(x, x, x, mask=mask)
out = self.layer_norm(x + self.dropout_layer(c))
else:
normalized_x = self.layer_norm(x)
c, w = self.attention(
normalized_x, normalized_x, normalized_x, mask=mask)
            out = x + self.dropout_layer(c)
            out *= paddle.transpose(mask, [0, 2, 1])  # mask padding positions
return out, w
class FastSpeechFFN(nn.Layer):
"""FFN, it can either be 2 linear or 2 conv1d."""
def __init__(self, model_dim, hidden_dim, kernel_size=1):
super().__init__()
if kernel_size == 1:
self.layer1 = nn.Linear(model_dim, hidden_dim)
self.layer2 = nn.Linear(hidden_dim, model_dim)
else:
self.layer1 = nn.Conv1D(
model_dim,
hidden_dim,
kernel_size,
padding="same",
data_format="NLC")
self.layer2 = nn.Conv1D(
hidden_dim,
model_dim,
kernel_size,
padding="same",
data_format="NLC")
def forward(self, x, mask=None):
# [B, T, C], [B, T] -> [B, T, C]
h = self.layer1(x)
h = F.relu(h) # TODO: use mish here?
h = self.layer2(h)
h *= paddle.unsqueeze(mask, -1) # mask padding positions
return h
class FastSpeechFFNNorm(nn.Layer):
def __init__(self,
model_dim,
hidden_dim,
kernel_size,
residual_dropout=0.,
epsilon=1e-5,
scheme="post"):
super().__init__()
if scheme not in ["post", "pre"]:
raise ValueError("scheme should be 'pre' or 'post'")
self.scheme = scheme
self.ffn = FastSpeechFFN(
model_dim, hidden_dim, kernel_size=kernel_size)
self.layer_norm = nn.LayerNorm([model_dim], epsilon=epsilon)
self.dropout_layer = nn.Dropout(residual_dropout)
def forward(self, x, mask=None):
if self.scheme == "post":
h = self.ffn(x, mask)
out = self.layer_norm(x + self.dropout_layer(h))
else:
normalized_x = self.layer_norm(x)
h = self.ffn(normalized_x, mask)
out = x + self.dropout_layer(h)
out *= paddle.unsqueeze(mask, -1) # mask padding positions
return out
class FastSpeechFFTBlock(nn.Layer):
def __init__(self,
model_dim,
num_heads,
ffn_dim,
ffn_kernel_size,
attention_dropout=0.,
residual_dropout=0.,
epsilon=1e-5,
scheme="post"):
super().__init__()
self.attention = FastSpeechSelfAttentionNorm(
model_dim, num_heads, attention_dropout, residual_dropout, epsilon,
scheme)
self.ffn = FastSpeechFFNNorm(model_dim, ffn_dim, ffn_kernel_size,
residual_dropout, epsilon, scheme)
def forward(self, x, mask):
# [B, T, C]
# [B, 1, T]
c, w = self.attention(x, mask)
        c = self.ffn(c, paddle.squeeze(mask, axis=1))
return c, w
class FastSpeechDurationPredictor(nn.Layer):
def __init__(self,
num_layers: int,
input_dim: int,
hidden_dim: int,
kernel_size: int,
dropout: float=0.,
epsilon: float=1e-5):
super().__init__()
convs = []
for i in range(num_layers):
conv = nn.Conv1D(
input_dim if i == 0 else hidden_dim,
hidden_dim,
kernel_size,
padding="same",
data_format="NLC")
layer_norm = nn.LayerNorm([hidden_dim], epsilon=epsilon)
act = nn.ReLU6()
dropout_layer = nn.Dropout(dropout)
convs.extend([conv, layer_norm, act, dropout_layer])
self.conv_layers = nn.Sequential(*convs)
self.output_fc = nn.Linear(hidden_dim, 1)
def forward(self, x, mask):
# [B, T, C], [B, T] -> [B, T]
mask = paddle.unsqueeze(mask, -1)
x *= mask
h = self.conv_layers(x)
h = self.output_fc(h)
h *= mask
h = F.relu6(h).squeeze(-1)
return h
class FastSpeechLengthRegulator(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, durations):
# [B, T, C], [B, T] -> [B, T', C], [B]
output_lens = paddle.sum(durations, axis=-1)
batch_size = x.shape[0]
expanded_sequences = []
for i in range(batch_size):
expanded_sequence = geo.repeat(x[i], durations[i], axis=0)
expanded_sequences.append(expanded_sequence)
padded_sequence = geo.pad_sequences(expanded_sequences)
return padded_sequence, output_lens
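# Worked example for the length regulator (annotation, not original code):
# geo.repeat repeats frame i of x[b] durations[b][i] times along the time axis.
#   x: [2, 3, C], durations: [[1, 2, 0], [2, 2, 2]]
#   -> sample 0 expands to 1 + 2 + 0 = 3 frames, sample 1 to 2 + 2 + 2 = 6
#   -> padded_sequence: [2, 6, C], output_lens: [3, 6]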
class TacotronPostNet(nn.Layer):
def __init__(self,
num_layers,
input_dim,
hidden_dim,
kernel_size,
dropout=0.,
momentum=0.9,
epsilon=1e-5):
super().__init__()
self.conv_bns = nn.LayerList()
self.num_layers = num_layers
for i in range(num_layers):
convbn = Conv1dBatchNorm(
input_dim if i == 0 else hidden_dim,
hidden_dim if i != num_layers - 1 else input_dim,
kernel_size,
padding="same",
data_format="NLC",
momentum=momentum,
epsilon=epsilon)
self.conv_bns.append(convbn)
self.dropout_layer = nn.Dropout(dropout)
def forward(self, x, mask):
# [B, T, C], [B, T] -> [B, T, C]
mask = paddle.unsqueeze(mask, -1)
for i, convbn in enumerate(self.conv_bns):
x = convbn(x)
if i != self.num_layers - 1:
x = paddle.tanh(x)
x = self.dropout_layer(x)
x *= mask
return x
class FastSpeechVariancePredictor(nn.Layer):
def __init__(self,
num_layers: int,
input_dim: int,
hidden_dim: int,
kernel_size: int,
num_speakers: int=1,
speaker_embedding_size: Optional[int]=None,
dropout: float=0.,
epsilon: float=1e-5):
super().__init__()
convs = []
for i in range(num_layers):
conv = nn.Conv1D(
input_dim if i == 0 else hidden_dim,
hidden_dim,
kernel_size,
padding="same",
data_format="NLC")
act = nn.ReLU()
ln = nn.LayerNorm([hidden_dim], epsilon=epsilon)
dropout_layer = nn.Dropout(dropout)
convs.extend([conv, act, ln, dropout_layer])
self.conv_layers = nn.Sequential(*convs)
self.output_fc = nn.Linear(hidden_dim, 1)
self.num_speakers = num_speakers
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers,
speaker_embedding_size)
self.speaker_fc = nn.Linear(speaker_embedding_size, input_dim)
def forward(self, x, speaker_ids, mask):
# [B, T, C], [B], [B, T] -> [B, T]
if self.num_speakers > 1:
            speaker_embed = self.speaker_embedding(speaker_ids)
speaker_features = F.softplus(self.speaker_fc(speaker_embed))
x += paddle.unsqueeze(speaker_features, 1)
x *= paddle.unsqueeze(mask, -1)
h = self.conv_layers(x)
out = self.output_fc(h)
        out = paddle.squeeze(out, -1) * mask
return out
class FastSpeech(nn.Layer):
def __init__(
self,
vocab_size,
num_speakers,
# encoder params
encoder_num_layers,
encoder_dim,
encoder_num_heads,
encoder_max_position,
encoder_ffn_dim,
encoder_ffn_kernel_size,
# decoder params
decoder_num_layers,
decoder_dim,
decoder_num_heads,
decoder_max_position,
decoder_ffn_dim,
decoder_ffn_kernel_size,
# encoder & decoder common
attention_dropout,
residual_dropout,
# duration predictor
duration_predictor_num_layers,
duration_predictor_dim,
duration_predictor_kernel_size,
duration_predictor_dropout,
# output
mel_dim,
# postnet
postnet_num_layers,
postnet_dim,
postnet_kernel_size,
postnet_dropout,
# other
padding_idx=0,
momentum=0.9,
epsilon=1e-5,
scheme="post"):
super().__init__()
self.embedding = nn.Embedding(
vocab_size, encoder_dim, padding_idx=padding_idx)
self.encoder = FastSpeechFeedForwardTransformer(
encoder_num_layers,
encoder_dim,
encoder_num_heads,
encoder_ffn_dim,
encoder_ffn_kernel_size,
attention_dropout,
residual_dropout,
num_speakers=num_speakers,
max_position=encoder_max_position,
epsilon=epsilon,
scheme=scheme)
self.duration_predictor = FastSpeechDurationPredictor(
duration_predictor_num_layers,
encoder_dim,
duration_predictor_dim,
duration_predictor_kernel_size,
duration_predictor_dropout,
epsilon=epsilon)
self.length_regulator = FastSpeechLengthRegulator()
self.decoder = FastSpeechFeedForwardTransformer(
decoder_num_layers,
decoder_dim,
decoder_num_heads,
decoder_ffn_dim,
decoder_ffn_kernel_size,
attention_dropout,
residual_dropout,
num_speakers=num_speakers,
max_position=decoder_max_position,
input_dim=encoder_dim,
epsilon=epsilon,
scheme=scheme)
self.mel_output_fc = nn.Linear(decoder_dim, mel_dim)
self.postnet = TacotronPostNet(
postnet_num_layers,
mel_dim,
postnet_dim,
postnet_kernel_size,
postnet_dropout,
momentum=momentum,
epsilon=epsilon)
def forward(self, text_ids, speaker_ids, durations, text_lens):
dtype = paddle.get_default_dtype()
encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
embedding = self.embedding(text_ids)
encoder_output = self.encoder(embedding, encoder_attention_mask,
speaker_ids)
        # Detach so the duration predictor's gradients do not flow back into
        # the encoder.
predicted_durations = self.duration_predictor(encoder_output.detach(),
encoder_padding_mask)
expanded_outputs, mel_lens = self.length_regulator(encoder_output,
durations)
decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
        decoder_outputs = self.decoder(
            expanded_outputs,
            decoder_attention_mask,
            speaker_ids)
        decoder_mel = self.mel_output_fc(decoder_outputs)
postnet_mel = decoder_mel + self.postnet(decoder_mel,
decoder_padding_mask)
return decoder_mel, postnet_mel, predicted_durations
def inference(self, text_ids, speaker_ids, text_lens, speed_ratios):
dtype = paddle.get_default_dtype()
encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
embedding = self.embedding(text_ids)
encoder_output = self.encoder(embedding, encoder_attention_mask,
speaker_ids)
        # Detach the gradient flow as in training; at inference the predictor's
        # output is interpreted as log(duration + 1), hence the exp below.
predicted_log_durations = self.duration_predictor(
encoder_output.detach(), encoder_padding_mask)
predicted_durations = paddle.exp(predicted_log_durations) - 1.
if speed_ratios is None:
speed_ratios = paddle.ones([1], dtype=dtype)
speed_ratios = paddle.unsqueeze(speed_ratios, -1)
predicted_durations = paddle.round(predicted_durations *
speed_ratios).astype("int32")
expanded_outputs, mel_lens = self.length_regulator(encoder_output,
predicted_durations)
decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
        decoder_outputs = self.decoder(expanded_outputs, decoder_attention_mask,
                                       speaker_ids)
        decoder_mel = self.mel_output_fc(decoder_outputs)
postnet_mel = decoder_mel + self.postnet(decoder_mel,
decoder_padding_mask)
return decoder_mel, postnet_mel, predicted_durations
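# Usage sketch (annotation, not original code; shapes and hyperparameters are
# illustrative assumptions, not values from this repository):
#   model = FastSpeech(
#       vocab_size=80, num_speakers=1,
#       encoder_num_layers=4, encoder_dim=256, encoder_num_heads=2,
#       encoder_max_position=1000, encoder_ffn_dim=1024,
#       encoder_ffn_kernel_size=3,
#       decoder_num_layers=4, decoder_dim=256, decoder_num_heads=2,
#       decoder_max_position=1000, decoder_ffn_dim=1024,
#       decoder_ffn_kernel_size=3,
#       attention_dropout=0., residual_dropout=0.1,
#       duration_predictor_num_layers=2, duration_predictor_dim=256,
#       duration_predictor_kernel_size=3, duration_predictor_dropout=0.1,
#       mel_dim=80, postnet_num_layers=5, postnet_dim=512,
#       postnet_kernel_size=5, postnet_dropout=0.1)
#   Training: decoder_mel, postnet_mel, pred_durations = model(
#       text_ids,     # int64 [B, T_text]
#       speaker_ids,  # int64 [B] (unused when num_speakers == 1)
#       durations,    # int64 [B, T_text], ground-truth frames per token
#       text_lens)    # int64 [B]
#   Inference: model.inference(text_ids, speaker_ids, text_lens, speed_ratios)
#       with speed_ratios: float [B] or None.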
# TODO: finish FastSpeech2 (the variance predictors below are not yet used in forward/inference)
class FastSpeech2(nn.Layer):
def __init__(
self,
vocab_size,
num_speakers,
# encoder params
encoder_num_layers,
encoder_dim,
encoder_num_heads,
encoder_max_position,
encoder_ffn_dim,
encoder_ffn_kernel_size,
# decoder params
decoder_num_layers,
decoder_dim,
decoder_num_heads,
decoder_max_position,
decoder_ffn_dim,
decoder_ffn_kernel_size,
# encoder & decoder common
attention_dropout,
residual_dropout,
# duration predictor
duration_predictor_num_layers,
duration_predictor_dim,
duration_predictor_kernel_size,
duration_predictor_dropout,
# output
mel_dim,
# postnet
postnet_num_layers,
postnet_dim,
postnet_kernel_size,
postnet_dropout,
# variance predictor
variance_predictor_num_layers,
variance_predictor_dim,
variance_predictor_kernel_size,
variance_predictor_dropout,
# other
padding_idx=0,
momentum=0.9,
epsilon=1e-5,
scheme="post"):
super().__init__()
self.embedding = nn.Embedding(
vocab_size, encoder_dim, padding_idx=padding_idx)
self.encoder = FastSpeechFeedForwardTransformer(
encoder_num_layers,
encoder_dim,
encoder_num_heads,
encoder_ffn_dim,
encoder_ffn_kernel_size,
attention_dropout,
residual_dropout,
num_speakers=num_speakers,
max_position=encoder_max_position,
epsilon=epsilon,
scheme=scheme)
self.duration_predictor = FastSpeechDurationPredictor(
duration_predictor_num_layers,
encoder_dim,
duration_predictor_dim,
duration_predictor_kernel_size,
duration_predictor_dropout,
epsilon=epsilon)
self.length_regulator = FastSpeechLengthRegulator()
self.decoder = FastSpeechFeedForwardTransformer(
decoder_num_layers,
decoder_dim,
decoder_num_heads,
decoder_ffn_dim,
decoder_ffn_kernel_size,
attention_dropout,
residual_dropout,
num_speakers=num_speakers,
max_position=decoder_max_position,
input_dim=encoder_dim,
epsilon=epsilon,
scheme=scheme)
self.mel_output_fc = nn.Linear(decoder_dim, mel_dim)
self.postnet = TacotronPostNet(
postnet_num_layers,
mel_dim,
postnet_dim,
postnet_kernel_size,
postnet_dropout,
momentum=momentum,
epsilon=epsilon)
        # FastSpeech2 additions: variance predictors for pitch (f0) and energy.
        self.f0_predictor = FastSpeechVariancePredictor(
            variance_predictor_num_layers,
            encoder_dim,
            variance_predictor_dim,
            variance_predictor_kernel_size,
            num_speakers,
            speaker_embedding_size=encoder_dim,
            dropout=variance_predictor_dropout)
        self.energy_predictor = FastSpeechVariancePredictor(
            variance_predictor_num_layers,
            encoder_dim,
            variance_predictor_dim,
            variance_predictor_kernel_size,
            num_speakers,
            speaker_embedding_size=encoder_dim,
            dropout=variance_predictor_dropout)
        # self.duration_predictor = FastSpeechVariancePredictor(
        #     variance_predictor_num_layers,
        #     encoder_dim,
        #     variance_predictor_dim,
        #     variance_predictor_kernel_size,
        #     num_speakers,
        #     speaker_embedding_size=encoder_dim)
self.f0_embedding = nn.Conv1D(
1, encoder_dim, kernel_size=9, padding="same", data_format="NLC")
self.f0_dropout_layer = nn.Dropout(0.5)
self.energy_embeddings = nn.Conv1D(
1, encoder_dim, kernel_size=9, padding="same", data_format="NLC")
self.energy_dropout = nn.Dropout(0.5)
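        # NOTE (annotation): in a complete FastSpeech2 forward pass, the
        # predicted f0 and energy would typically be embedded through
        # self.f0_embedding / self.energy_embeddings and added to the
        # length-regulated features before decoding; the forward and
        # inference methods below do not do this yet.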
def forward(self, text_ids, speaker_ids, durations, text_lens):
dtype = paddle.get_default_dtype()
encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
embedding = self.embedding(text_ids)
encoder_output = self.encoder(embedding, encoder_attention_mask,
speaker_ids)
        # Detach so the duration predictor's gradients do not flow back into
        # the encoder.
predicted_durations = self.duration_predictor(encoder_output.detach(),
encoder_padding_mask)
expanded_outputs, mel_lens = self.length_regulator(encoder_output,
durations)
decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
        decoder_outputs = self.decoder(
            expanded_outputs,
            decoder_attention_mask,
            speaker_ids)
        decoder_mel = self.mel_output_fc(decoder_outputs)
postnet_mel = decoder_mel + self.postnet(decoder_mel,
decoder_padding_mask)
return decoder_mel, postnet_mel, predicted_durations
def inference(self, text_ids, speaker_ids, text_lens, speed_ratios):
dtype = paddle.get_default_dtype()
encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
embedding = self.embedding(text_ids)
encoder_output = self.encoder(embedding, encoder_attention_mask,
speaker_ids)
        # Detach the gradient flow as in training; at inference the predictor's
        # output is interpreted as log(duration + 1), hence the exp below.
predicted_log_durations = self.duration_predictor(
encoder_output.detach(), encoder_padding_mask)
predicted_durations = paddle.exp(predicted_log_durations) - 1.
if speed_ratios is None:
speed_ratios = paddle.ones([1], dtype=dtype)
speed_ratios = paddle.unsqueeze(speed_ratios, -1)
predicted_durations = paddle.round(predicted_durations *
speed_ratios).astype("int32")
expanded_outputs, mel_lens = self.length_regulator(encoder_output,
predicted_durations)
decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
        decoder_outputs = self.decoder(expanded_outputs, decoder_attention_mask,
                                       speaker_ids)
        decoder_mel = self.mel_output_fc(decoder_outputs)
postnet_mel = decoder_mel + self.postnet(decoder_mel,
decoder_padding_mask)
return decoder_mel, postnet_mel, predicted_durations

View File

@@ -1,162 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable, Mapping, List
from pathlib import Path
class KBest(object):
"""
    A utility class that saves disk space by keeping only the K best
    checkpoints.
    To stay as modular as possible, this class assumes nothing about a
    Trainer class or a checkpoint directory, and knows nothing about the
    model or the optimizer.
    It is essentially a dynamically maintained K-best mapping. When a new
    item is added to the map, save_fn is called; when an item is removed
    from the map, del_fn is called. Both `save_fn` and `del_fn` take a Path
    object and return nothing.
    Though designed to control checkpointing behavior, it can be used for
    other purposes given suitable save_fn and del_fn.
Example
--------
>>> from pathlib import Path
    >>> import shutil
    >>> import numpy as np
>>> import paddle
>>> from paddle import nn
>>> model = nn.Linear(2, 3)
>>> def save_model(path):
... paddle.save(model.state_dict(), path)
>>> kbest_manager = KBest(max_size=5, save_fn=save_model)
>>> checkpoint_dir = Path("checkpoints")
    >>> shutil.rmtree(checkpoint_dir, ignore_errors=True)
>>> checkpoint_dir.mkdir(parents=True)
>>> a = np.random.rand(20)
>>> for i, score in enumerate(a):
... path = checkpoint_dir / f"step_{i}"
... kbest_manager.add_checkpoint(score, path)
>>> assert len(list(checkpoint_dir.glob("step_*"))) == 5
"""
def __init__(self,
max_size: int=5,
save_fn: Callable[[Path], None]=None,
del_fn: Callable[[Path], None]=lambda f: f.unlink()):
self.best_records: Mapping[Path, float] = {}
self.save_fn = save_fn
self.del_fn = del_fn
self.max_size = max_size
self._save_all = (max_size == -1)
def should_save(self, metric: float) -> bool:
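        # Smaller metric values are treated as better (e.g. validation loss):
        # save only if the new metric beats the current worst retained record.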
if not self.full():
return True
# already full
worst_record_path = max(self.best_records, key=self.best_records.get)
worst_metric = self.best_records[worst_record_path]
return metric < worst_metric
def full(self):
return (not self._save_all) and len(self.best_records) == self.max_size
def add_checkpoint(self, metric, path):
if self.should_save(metric):
self.save_checkpoint_and_update(metric, path)
def save_checkpoint_and_update(self, metric, path):
# remove the worst
if self.full():
worst_record_path = max(self.best_records,
key=self.best_records.get)
self.best_records.pop(worst_record_path)
self.del_fn(worst_record_path)
# add the new one
self.save_fn(path)
self.best_records[path] = metric
class KLatest(object):
"""
    A utility class that saves disk space by keeping only the K latest
    checkpoints.
    To stay as modular as possible, this class assumes nothing about a
    Trainer class or a checkpoint directory, and knows nothing about the
    model or the optimizer.
    It is essentially a dynamically maintained queue. When a new item is
    added to the queue, save_fn is called; when an item is removed from the
    queue, del_fn is called. Both `save_fn` and `del_fn` take a Path object
    and return nothing.
    Though designed to control checkpointing behavior, it can be used for
    other purposes given suitable save_fn and del_fn.
Example
--------
>>> from pathlib import Path
>>> import shutil
>>> import paddle
>>> from paddle import nn
>>> model = nn.Linear(2, 3)
>>> def save_model(path):
... paddle.save(model.state_dict(), path)
>>> klatest_manager = KLatest(max_size=5, save_fn=save_model)
>>> checkpoint_dir = Path("checkpoints")
    >>> shutil.rmtree(checkpoint_dir, ignore_errors=True)
>>> checkpoint_dir.mkdir(parents=True)
>>> for i in range(20):
... path = checkpoint_dir / f"step_{i}"
... klatest_manager.add_checkpoint(path)
>>> assert len(list(checkpoint_dir.glob("step_*"))) == 5
"""
def __init__(self,
max_size: int=5,
save_fn: Callable[[Path], None]=None,
del_fn: Callable[[Path], None]=lambda f: f.unlink()):
self.latest_records: List[Path] = []
self.save_fn = save_fn
self.del_fn = del_fn
self.max_size = max_size
self._save_all = (max_size == -1)
def full(self):
return (
not self._save_all) and len(self.latest_records) == self.max_size
def add_checkpoint(self, path):
self.save_checkpoint_and_update(path)
def save_checkpoint_and_update(self, path):
        # remove the earliest
        if self.full():
            earliest_record_path = self.latest_records.pop(0)
            self.del_fn(earliest_record_path)
# add the new one
self.save_fn(path)
self.latest_records.append(path)

View File

@@ -1,52 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import shutil
import numpy as np
from parakeet.training.checkpoint import KBest, KLatest
def test_kbest():
def save_fn(path):
with open(path, 'wt') as f:
f.write(f"My path is {str(path)}\n")
K = 1
kbest_manager = KBest(max_size=K, save_fn=save_fn)
checkpoint_dir = Path("checkpoints")
    shutil.rmtree(checkpoint_dir, ignore_errors=True)
checkpoint_dir.mkdir(parents=True)
a = np.random.rand(20)
for i, score in enumerate(a):
path = checkpoint_dir / f"step_{i}"
kbest_manager.add_checkpoint(score, path)
assert len(list(checkpoint_dir.glob("step_*"))) == K
def test_klatest():
def save_fn(path):
with open(path, 'wt') as f:
f.write(f"My path is {str(path)}\n")
K = 5
klatest_manager = KLatest(max_size=K, save_fn=save_fn)
checkpoint_dir = Path("checkpoints")
    shutil.rmtree(checkpoint_dir, ignore_errors=True)
checkpoint_dir.mkdir(parents=True)
for i in range(20):
path = checkpoint_dir / f"step_{i}"
klatest_manager.add_checkpoint(path)
assert len(list(checkpoint_dir.glob("step_*"))) == K
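Both tests follow pytest conventions and can be run directly (the test file's path is not shown in this view; assuming something like tests/test_checkpoint.py):

    python -m pytest tests/test_checkpoint.py -v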