Merge pull request #118 from PaddlePaddle/revert-116-fastspeech

Revert "Add models/fastspeech2"
2021-06-16 14:22:00 +08:00 · 2021-06-16 14:21:24 +08:00 · 2021-06-16 14:20:30 +08:00 · 2021-05-20 11:09:19 +08:00 · 2021-05-20 10:27:01 +08:00 · 2021-05-18 18:56:07 +08:00
4 changed files with 1 addition and 927 deletions
--- a/parakeet/__init__.py
+++ b/parakeet/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.2.0-beta.0"
+__version__ = "0.3.1"

 from parakeet import audio, data, datasets, frontend, models, modules, training, utils
--- a/parakeet/models/fastspeech2.py
+++ b/parakeet/models/fastspeech2.py
@@ -1,712 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-from paddle import nn
-from paddle.nn import functional as F
-from paddle.nn import initializer as I
-from paddle.fluid.layers import sequence_mask
-
-from parakeet.modules.positioning import position_encoding
-from parakeet.modules.attention import (_split_heads, _concat_heads,
-                                        scaled_dot_product_attention)
-from parakeet.modules import geometry as geo
-from parakeet.modules.conv import Conv1dBatchNorm
-
-from typing import Optional
-
-
-class FastSpeechFeedForwardTransformer(nn.Layer):
-    def __init__(self,
-                 num_layers,
-                 model_dim,
-                 num_heads,
-                 ffn_dim,
-                 ffn_kernel_size,
-                 attention_dropout=0.,
-                 residual_dropout=0.,
-                 num_speakers=1,
-                 max_position=1000,
-                 input_dim: Optional[int]=None,
-                 epsilon=1e-5,
-                 scheme="post"):
-        super().__init__()
-        # optional input layer
-        input_dim = input_dim or model_dim
-        self.input_dim = input_dim
-        self.model_dim = model_dim
-        if input_dim != model_dim:
-            self.input_fc = nn.Linear(input_dim, model_dim)
-
-        self.pos_embedding = position_encoding(1 + max_position, model_dim)
-
-        self.num_speakers = num_speakers
-        if num_speakers > 1:
-            self.speaker_embedding = nn.Embedding(num_speakers, model_dim)
-            self.speaker_fc = nn.Linear(model_dim, model_dim)
-
-        self.layers = nn.LayerList([
-            FastSpeechFFTBlock(model_dim, num_heads, ffn_dim, ffn_kernel_size,
-                               attention_dropout, residual_dropout, epsilon,
-                               scheme) for _ in range(num_layers)
-        ])
-
-    def forward(self, x, mask, speaker_ids=None):
-        """
-        x: [B, T, C]
-        mask: [B, 1, T] or [B, T, T]
-        returns: [B, T, C]
-        """
-        if self.input_dim != self.model_dim:
-            x = self.input_fc(x)
-
-        batch_size, time_steps, _ = x.shape
-        pos_embed = self.pos_embedding[1:1 + time_steps, :]
-        x += pos_embed
-
-        if self.num_speakers > 1:
-            speaker_embedding = self.speaker_embedding(speaker_ids)
-            speaker_feature = F.softplus(self.speaker_fc(speaker_embedding))
-            speaker_feature = paddle.unsqueeze(speaker_feature, 1)  # [B, T, C]
-            x += speaker_feature
-
-        for layer in self.layers:
-            x, attn = layer(x, mask)
-        # we do not return attention here
-        return x
-
-
-class MultiheadAttention(nn.Layer):
-    def __init__(self,
-                 model_dim: int,
-                 num_heads: int,
-                 k_input_dim: Optional[int]=None,
-                 v_input_dim: Optional[int]=None,
-                 dropout: float=0.):
-        super().__init__()
-        if model_dim % num_heads != 0:
-            raise ValueError("model_dim must be divisible by num_heads")
-        depth = model_dim // num_heads
-        k_input_dim = k_input_dim or model_dim
-        v_input_dim = v_input_dim or model_dim
-        self.wq = nn.Linear(model_dim, model_dim)
-        self.wk = nn.Linear(k_input_dim, model_dim)
-        self.wv = nn.Linear(v_input_dim, model_dim)
-        self.wo = nn.Linear(model_dim, model_dim)
-
-        self.num_heads = num_heads
-        self.model_dim = model_dim
-        self.dropout = dropout
-
-    def forward(self, q, k, v, mask=None):
-        q = _split_heads(self.wq(q), self.num_heads)  # (B, h, T, C)
-        k = _split_heads(self.wk(k), self.num_heads)
-        v = _split_heads(self.wv(v), self.num_heads)
-        if mask is not None:
-            mask = paddle.unsqueeze(mask, 1)  # unsqueeze for the h dim
-
-        context_vectors, attention_weights = scaled_dot_product_attention(
-            q, k, v, mask, dropout=self.dropout, training=self.training)
-        context_vectors = _concat_heads(context_vectors)
-        context_vectors = self.wo(context_vectors)
-        return context_vectors, attention_weights
-
-
-class FastSpeechSelfAttentionNorm(nn.Layer):
-    """Self attention & Layer normalization, both schemes are supported."""
-
-    def __init__(self,
-                 model_dim,
-                 num_heads,
-                 attention_dropout=0.,
-                 residual_dropout=0.,
-                 epsilon=1e-5,
-                 scheme="post"):
-        super().__init__()
-        if scheme not in ["post", "pre"]:
-            raise ValueError("scheme should be 'pre' or 'post'")
-        self.scheme = scheme
-
-        self.attention = MultiheadAttention(
-            model_dim, num_heads, dropout=attention_dropout)
-        self.layer_norm = nn.LayerNorm([model_dim], epsilon=epsilon)
-        self.dropout_layer = nn.Dropout(residual_dropout)
-
-    def forward(self, x, mask=None):
-        # [B, T, C], [B, 1, T] -> [B, T, C], [B, T, T]
-        if self.scheme is "post":
-            c, w = self.attention(x, x, x, mask=mask)
-            out = self.layer_norm(x + self.dropout_layer(c))
-        else:
-            normalized_x = self.layer_norm(x)
-            c, w = self.attention(
-                normalized_x, normalized_x, normalized_x, mask=mask)
-            out = x + self.dropout_layer(c)
-
-        c *= paddle.transpose(mask, [0, 2, 1])  # mask padding positions
-        return out, w
-
-
-class FastSpeechFFN(nn.Layer):
-    """FFN, it can either be 2 linear or 2 conv1d."""
-
-    def __init__(self, model_dim, hidden_dim, kernel_size=1):
-        super().__init__()
-        if kernel_size == 1:
-            self.layer1 = nn.Linear(model_dim, hidden_dim)
-            self.layer2 = nn.Linear(hidden_dim, model_dim)
-        else:
-            self.layer1 = nn.Conv1D(
-                model_dim,
-                hidden_dim,
-                kernel_size,
-                padding="same",
-                data_format="NLC")
-            self.layer2 = nn.Conv1D(
-                hidden_dim,
-                model_dim,
-                kernel_size,
-                padding="same",
-                data_format="NLC")
-
-    def forward(self, x, mask=None):
-        # [B, T, C], [B, T] -> [B, T, C]
-        h = self.layer1(x)
-        h = F.relu(h)  # TODO: use mish here?
-        h = self.layer2(h)
-        h *= paddle.unsqueeze(mask, -1)  # mask padding positions
-        return h
-
-
-class FastSpeechFFNNorm(nn.Layer):
-    def __init__(self,
-                 model_dim,
-                 hidden_dim,
-                 kernel_size,
-                 residual_dropout=0.,
-                 epsilon=1e-5,
-                 scheme="post"):
-        super().__init__()
-        if scheme not in ["post", "pre"]:
-            raise ValueError("scheme should be 'pre' or 'post'")
-        self.scheme = scheme
-
-        self.ffn = FastSpeechFFN(
-            model_dim, hidden_dim, kernel_size=kernel_size)
-        self.layer_norm = nn.LayerNorm([model_dim], epsilon=epsilon)
-        self.dropout_layer = nn.Dropout(residual_dropout)
-
-    def forward(self, x, mask=None):
-        if self.scheme == "post":
-            h = self.ffn(x, mask)
-            out = self.layer_norm(x + self.dropout_layer(h))
-        else:
-            normalized_x = self.layer_norm(x)
-            h = self.ffn(normalized_x, mask)
-            out = x + self.dropout_layer(h)
-        out *= paddle.unsqueeze(mask, -1)  # mask padding positions
-        return out
-
-
-class FastSpeechFFTBlock(nn.Layer):
-    def __init__(self,
-                 model_dim,
-                 num_heads,
-                 ffn_dim,
-                 ffn_kernel_size,
-                 attention_dropout=0.,
-                 residual_dropout=0.,
-                 epsilon=1e-5,
-                 scheme="post"):
-        super().__init__()
-        self.attention = FastSpeechSelfAttentionNorm(
-            model_dim, num_heads, attention_dropout, residual_dropout, epsilon,
-            scheme)
-        self.ffn = FastSpeechFFNNorm(model_dim, ffn_dim, ffn_kernel_size,
-                                     residual_dropout, epsilon, scheme)
-
-    def forward(self, x, mask):
-        # [B, T, C]
-        # [B, 1, T]
-        c, w = self.attention(x, mask)
-        c = self.ffn(c, paddle.squeeze(mask))
-        return c, w
-
-
-class FastSpeechDurationPredictor(nn.Layer):
-    def __init__(self,
-                 num_layers: int,
-                 input_dim: int,
-                 hidden_dim: int,
-                 kernel_size: int,
-                 dropout: float=0.,
-                 epsilon: float=1e-5):
-        super().__init__()
-        convs = []
-        for i in range(num_layers):
-            conv = nn.Conv1D(
-                input_dim if i == 0 else hidden_dim,
-                hidden_dim,
-                kernel_size,
-                padding="same",
-                data_format="NLC")
-            layer_norm = nn.LayerNorm([hidden_dim], epsilon=epsilon)
-            act = nn.ReLU6()
-            dropout_layer = nn.Dropout(dropout)
-            convs.extend([conv, layer_norm, act, dropout_layer])
-        self.conv_layers = nn.Sequential(*convs)
-        self.output_fc = nn.Linear(hidden_dim, 1)
-
-    def forward(self, x, mask):
-        # [B, T, C], [B, T] -> [B, T]
-        mask = paddle.unsqueeze(mask, -1)
-        x *= mask
-
-        h = self.conv_layers(x)
-        h = self.output_fc(h)
-        h *= mask
-        h = F.relu6(h).squeeze(-1)
-        return h
-
-
-class FastSpeechLengthRegulator(nn.Layer):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x, durations):
-        # [B, T, C], [B, T] -> [B, T', C], [B]
-        output_lens = paddle.sum(durations, axis=-1)
-
-        batch_size = x.shape[0]
-        expanded_sequences = []
-        for i in range(batch_size):
-            expanded_sequence = geo.repeat(x[i], durations[i], axis=0)
-            expanded_sequences.append(expanded_sequence)
-        padded_sequence = geo.pad_sequences(expanded_sequences)
-        return padded_sequence, output_lens
-
-
-class TacotronPostNet(nn.Layer):
-    def __init__(self,
-                 num_layers,
-                 input_dim,
-                 hidden_dim,
-                 kernel_size,
-                 dropout=0.,
-                 momentum=0.9,
-                 epsilon=1e-5):
-        super().__init__()
-        self.conv_bns = nn.LayerList()
-        self.num_layers = num_layers
-        for i in range(num_layers):
-            convbn = Conv1dBatchNorm(
-                input_dim if i == 0 else hidden_dim,
-                hidden_dim if i != num_layers - 1 else input_dim,
-                kernel_size,
-                padding="same",
-                data_format="NLC",
-                momentum=momentum,
-                epsilon=epsilon)
-            self.conv_bns.append(convbn)
-        self.dropout_layer = nn.Dropout(dropout)
-
-    def forward(self, x, mask):
-        # [B, T, C], [B, T] -> [B, T, C]
-        mask = paddle.unsqueeze(mask, -1)
-        for i, convbn in enumerate(self.conv_bns):
-            x = convbn(x)
-            if i != self.num_layers - 1:
-                x = paddle.tanh(x)
-            x = self.dropout_layer(x)
-        x *= mask
-        return x
-
-
-class FastSpeechVariancePredictor(nn.Layer):
-    def __init__(self,
-                 num_layers: int,
-                 input_dim: int,
-                 hidden_dim: int,
-                 kernel_size: int,
-                 num_speakers: int=1,
-                 speaker_embedding_size: Optional[int]=None,
-                 dropout: float=0.,
-                 epsilon: float=1e-5):
-        super().__init__()
-        convs = []
-        for i in range(num_layers):
-            conv = nn.Conv1D(
-                input_dim if i == 0 else hidden_dim,
-                hidden_dim,
-                kernel_size,
-                padding="same",
-                data_format="NLC")
-            act = nn.ReLU()
-            ln = nn.LayerNorm([hidden_dim], epsilon=epsilon)
-            dropout_layer = nn.Dropout(dropout)
-            convs.extend([conv, act, ln, dropout_layer])
-        self.conv_layers = nn.Sequential(*convs)
-        self.output_fc = nn.Linear(hidden_dim, 1)
-
-        self.num_speakers = num_speakers
-        if num_speakers > 1:
-            self.speaker_embedding = nn.Embedding(num_speakers,
-                                                  speaker_embedding_size)
-            self.speaker_fc = nn.Linear(speaker_embedding_size, input_dim)
-
-    def forward(self, x, speaker_ids, mask):
-        # [B, T, C], [B], [B, T] -> [B, T]
-        if self.num_speakers > 1:
-            speaker_embed = self.speaker_embeddings(speaker_ids)
-            speaker_features = F.softplus(self.speaker_fc(speaker_embed))
-            x += paddle.unsqueeze(speaker_features, 1)
-
-        x *= paddle.unsqueeze(mask, -1)
-
-        h = self.conv_layers(x)
-        out = self.output_fc(h)
-        out = paddle.squeeze(-1) * mask
-        return out
-
-
-class FastSpeech(nn.Layer):
-    def __init__(
-            self,
-            vocab_size,
-            num_speakers,
-            # encoder params
-            encoder_num_layers,
-            encoder_dim,
-            encoder_num_heads,
-            encoder_max_position,
-            encoder_ffn_dim,
-            encoder_ffn_kernel_size,
-            # decoder params
-            decoder_num_layers,
-            decoder_dim,
-            decoder_num_heads,
-            decoder_max_position,
-            decoder_ffn_dim,
-            decoder_ffn_kernel_size,
-            # encoder & decoder common
-            attention_dropout,
-            residual_dropout,
-            # duration predictor
-            duration_predictor_num_layers,
-            duration_predictor_dim,
-            duration_predictor_kernel_size,
-            duration_predictor_dropout,
-            # output
-            mel_dim,
-            # postnet
-            postnet_num_layers,
-            postnet_dim,
-            postnet_kernel_size,
-            postnet_dropout,
-            # other
-            padding_idx=0,
-            momentum=0.9,
-            epsilon=1e-5,
-            scheme="post"):
-        super().__init__()
-        self.embedding = nn.Embedding(
-            vocab_size, encoder_dim, padding_idx=padding_idx)
-        self.encoder = FastSpeechFeedForwardTransformer(
-            encoder_num_layers,
-            encoder_dim,
-            encoder_num_heads,
-            encoder_ffn_dim,
-            encoder_ffn_kernel_size,
-            attention_dropout,
-            residual_dropout,
-            num_speakers=num_speakers,
-            max_position=encoder_max_position,
-            epsilon=epsilon,
-            scheme=scheme)
-        self.duration_predictor = FastSpeechDurationPredictor(
-            duration_predictor_num_layers,
-            encoder_dim,
-            duration_predictor_dim,
-            duration_predictor_kernel_size,
-            duration_predictor_dropout,
-            epsilon=epsilon)
-        self.length_regulator = FastSpeechLengthRegulator()
-        self.decoder = FastSpeechFeedForwardTransformer(
-            decoder_num_layers,
-            decoder_dim,
-            decoder_num_heads,
-            decoder_ffn_dim,
-            decoder_ffn_kernel_size,
-            attention_dropout,
-            residual_dropout,
-            num_speakers=num_speakers,
-            max_position=decoder_max_position,
-            input_dim=encoder_dim,
-            epsilon=epsilon,
-            scheme=scheme)
-        self.mel_output_fc = nn.Linear(decoder_dim, mel_dim)
-        self.postnet = TacotronPostNet(
-            postnet_num_layers,
-            mel_dim,
-            postnet_dim,
-            postnet_kernel_size,
-            postnet_dropout,
-            momentum=momentum,
-            epsilon=epsilon)
-
-    def forward(self, text_ids, speaker_ids, durations, text_lens):
-        dtype = paddle.get_default_dtype()
-        encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
-        encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
-
-        embedding = self.embedding(text_ids)
-        encoder_output = self.encoder(embedding, encoder_attention_mask,
-                                      speaker_ids)
-
-        # detach the gradient of duration predictor
-        # a difference here
-        predicted_durations = self.duration_predictor(encoder_output.detach(),
-                                                      encoder_padding_mask)
-
-        expanded_outputs, mel_lens = self.length_regulator(encoder_output,
-                                                           durations)
-        decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
-        decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
-
-        decoder_ouputs = self.decoder(
-            expanded_outputs,
-            decoder_attention_mask,
-            speaker_ids, )
-        decoder_mel = self.mel_output_fc(decoder_ouputs)
-        postnet_mel = decoder_mel + self.postnet(decoder_mel,
-                                                 decoder_padding_mask)
-
-        return decoder_mel, postnet_mel, predicted_durations
-
-    def inference(self, text_ids, speaker_ids, text_lens, speed_ratios):
-        dtype = paddle.get_default_dtype()
-        encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
-        encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
-
-        embedding = self.embedding(text_ids)
-        encoder_output = self.encoder(embedding, encoder_attention_mask,
-                                      speaker_ids)
-
-        # detach the gradient flow of duration predictor
-        # a difference here
-        predicted_log_durations = self.duration_predictor(
-            encoder_output.detach(), encoder_padding_mask)
-        predicted_durations = paddle.exp(predicted_log_durations) - 1.
-
-        if speed_ratios is None:
-            speed_ratios = paddle.ones([1], dtype=dtype)
-        speed_ratios = paddle.unsqueeze(speed_ratios, -1)
-        predicted_durations = paddle.round(predicted_durations *
-                                           speed_ratios).astype("int32")
-
-        expanded_outputs, mel_lens = self.length_regulator(encoder_output,
-                                                           predicted_durations)
-        decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
-        decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
-
-        decoder_ouputs = self.decoder(expanded_outputs, decoder_attention_mask,
-                                      speaker_ids)
-        decoder_mel = self.mel_output_fc(decoder_ouputs)
-        postnet_mel = decoder_mel + self.postnet(decoder_mel,
-                                                 decoder_padding_mask)
-
-        return decoder_mel, postnet_mel, predicted_durations
-
-
-# TODO: implement FastSpeech2
-class FastSpeech2(nn.Layer):
-    def __init__(
-            self,
-            vocab_size,
-            num_speakers,
-            # encoder params
-            encoder_num_layers,
-            encoder_dim,
-            encoder_num_heads,
-            encoder_max_position,
-            encoder_ffn_dim,
-            encoder_ffn_kernel_size,
-            # decoder params
-            decoder_num_layers,
-            decoder_dim,
-            decoder_num_heads,
-            decoder_max_position,
-            decoder_ffn_dim,
-            decoder_ffn_kernel_size,
-            # encoder & decoder common
-            attention_dropout,
-            residual_dropout,
-            # duration predictor
-            duration_predictor_num_layers,
-            duration_predictor_dim,
-            duration_predictor_kernel_size,
-            duration_predictor_dropout,
-            # output
-            mel_dim,
-            # postnet
-            postnet_num_layers,
-            postnet_dim,
-            postnet_kernel_size,
-            postnet_dropout,
-            # variance predictor
-            variance_predictor_num_layers,
-            variance_predictor_dim,
-            variance_predictor_kernel_size,
-            variance_predictor_dropout,
-            # other
-            padding_idx=0,
-            momentum=0.9,
-            epsilon=1e-5,
-            scheme="post"):
-        super().__init__()
-        self.embedding = nn.Embedding(
-            vocab_size, encoder_dim, padding_idx=padding_idx)
-        self.encoder = FastSpeechFeedForwardTransformer(
-            encoder_num_layers,
-            encoder_dim,
-            encoder_num_heads,
-            encoder_ffn_dim,
-            encoder_ffn_kernel_size,
-            attention_dropout,
-            residual_dropout,
-            num_speakers=num_speakers,
-            max_position=encoder_max_position,
-            epsilon=epsilon,
-            scheme=scheme)
-        self.duration_predictor = FastSpeechDurationPredictor(
-            duration_predictor_num_layers,
-            encoder_dim,
-            duration_predictor_dim,
-            duration_predictor_kernel_size,
-            duration_predictor_dropout,
-            epsilon=epsilon)
-        self.length_regulator = FastSpeechLengthRegulator()
-        self.decoder = FastSpeechFeedForwardTransformer(
-            decoder_num_layers,
-            decoder_dim,
-            decoder_num_heads,
-            decoder_ffn_dim,
-            decoder_ffn_kernel_size,
-            attention_dropout,
-            residual_dropout,
-            num_speakers=num_speakers,
-            max_position=decoder_max_position,
-            input_dim=encoder_dim,
-            epsilon=epsilon,
-            scheme=scheme)
-        self.mel_output_fc = nn.Linear(decoder_dim, mel_dim)
-        self.postnet = TacotronPostNet(
-            postnet_num_layers,
-            mel_dim,
-            postnet_dim,
-            postnet_kernel_size,
-            postnet_dropout,
-            momentum=momentum,
-            epsilon=epsilon)
-        # difference here?
-        self.f0_predictor = FastSpeechVariancePredictor(
-            variance_predictor_num_layers,
-            embed_dim,
-            variance_predictor_dim,
-            variancce_predictor_kernel_size,
-            num_speakers,
-            speaker_embedding_size=embed_dim)
-        self.energy_predictor = FastSpeechVariancePredictor(
-            variance_predictor_num_layers,
-            embed_dim,
-            variance_predictor_dim,
-            variancce_predictor_kernel_size,
-            num_speakers,
-            speaker_embedding_size=embed_dim)
-        #self.duration_predictor = FastSpeechVariancePredictor(
-        #variance_predictor_num_layers,
-        #embed_dim,
-        #variance_predictor_dim,
-        #variancce_predictor_kernel_size,
-        #num_speakers,
-        #speaker_embedding_size=embed_dim)
-        self.f0_embedding = nn.Conv1D(
-            1, encoder_dim, kernel_size=9, padding="same", data_format="NLC")
-        self.f0_dropout_layer = nn.Dropout(0.5)
-        self.energy_embeddings = nn.Conv1D(
-            1, encoder_dim, kernel_size=9, padding="same", data_format="NLC")
-        self.energy_dropout = nn.Dropout(0.5)
-
-    def forward(self, text_ids, speaker_ids, durations, text_lens):
-        dtype = paddle.get_default_dtype()
-        encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
-        encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
-
-        embedding = self.embedding(text_ids)
-        encoder_output = self.encoder(embedding, encoder_attention_mask,
-                                      speaker_ids)
-
-        # detach the gradient of duration predictor
-        # a difference here
-        predicted_durations = self.duration_predictor(encoder_output.detach(),
-                                                      encoder_padding_mask)
-
-        expanded_outputs, mel_lens = self.length_regulator(encoder_output,
-                                                           durations)
-        decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
-        decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
-
-        decoder_ouputs = self.decoder(
-            expanded_outputs,
-            decoder_attention_mask,
-            speaker_ids, )
-        decoder_mel = self.mel_output_fc(decoder_ouputs)
-        postnet_mel = decoder_mel + self.postnet(decoder_mel,
-                                                 decoder_padding_mask)
-
-        return decoder_mel, postnet_mel, predicted_durations
-
-    def inference(self, text_ids, speaker_ids, text_lens, speed_ratios):
-        dtype = paddle.get_default_dtype()
-        encoder_padding_mask = sequence_mask(text_lens, dtype=dtype)
-        encoder_attention_mask = encoder_padding_mask.unsqueeze(1)
-
-        embedding = self.embedding(text_ids)
-        encoder_output = self.encoder(embedding, encoder_attention_mask,
-                                      speaker_ids)
-
-        # detach the gradient flow of duration predictor
-        # a difference here
-        predicted_log_durations = self.duration_predictor(
-            encoder_output.detach(), encoder_padding_mask)
-        predicted_durations = paddle.exp(predicted_log_durations) - 1.
-
-        if speed_ratios is None:
-            speed_ratios = paddle.ones([1], dtype=dtype)
-        speed_ratios = paddle.unsqueeze(speed_ratios, -1)
-        predicted_durations = paddle.round(predicted_durations *
-                                           speed_ratios).astype("int32")
-
-        expanded_outputs, mel_lens = self.length_regulator(encoder_output,
-                                                           predicted_durations)
-        decoder_padding_mask = sequence_mask(mel_lens, dtype=dtype)
-        decoder_attention_mask = decoder_padding_mask.unsqueeze(1)
-
-        decoder_ouputs = self.decoder(expanded_outputs, decoder_attention_mask,
-                                      speaker_ids)
-        decoder_mel = self.mel_output_fc(decoder_ouputs)
-        postnet_mel = decoder_mel + self.postnet(decoder_mel,
-                                                 decoder_padding_mask)
-
-        return decoder_mel, postnet_mel, predicted_durations
--- a/parakeet/training/checkpoint.py
+++ b/parakeet/training/checkpoint.py
@@ -1,162 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Callable, Mapping, List
-from pathlib import Path
-
-
-class KBest(object):
-    """
-    A utility class to help save the hard drive by only keeping K best 
-    checkpoints. 
-    
-    To be as modularized as possible, this class does not assume anything like 
-    a Trainer class or anything like a checkpoint directory, it does not know 
-    about the model or the optimizer, etc. 
-    
-    It is basically a dynamically mantained K-bset Mapping. When a new item is 
-    added to the map, save_fn is called. And when an item is removed from the 
-    map, del_fn is called. `save_fn` and `del_fn` takes a Path object as input 
-    and returns nothing.
-
-    Though it is designed to control checkpointing behaviors, it can be used 
-    to do something else if you pass some save_fn and del_fn.
-
-    Example
-    --------
-
-    >>> from pathlib import Path
-    >>> import shutil
-    >>> import paddle
-    >>> from paddle import nn
-    
-    >>> model = nn.Linear(2, 3)
-    >>> def save_model(path):
-    ...     paddle.save(model.state_dict(), path)
-
-    >>> kbest_manager = KBest(max_size=5, save_fn=save_model)
-    >>> checkpoint_dir = Path("checkpoints")
-    >>> shutil.rmtree(checkpoint_dir)
-    >>> checkpoint_dir.mkdir(parents=True)
-    >>> a = np.random.rand(20)
-    >>> for i, score in enumerate(a):
-    ...     path = checkpoint_dir / f"step_{i}"
-    ...     kbest_manager.add_checkpoint(score, path)
-    >>> assert len(list(checkpoint_dir.glob("step_*"))) == 5
-    """
-
-    def __init__(self,
-                 max_size: int=5,
-                 save_fn: Callable[[Path], None]=None,
-                 del_fn: Callable[[Path], None]=lambda f: f.unlink()):
-        self.best_records: Mapping[Path, float] = {}
-        self.save_fn = save_fn
-        self.del_fn = del_fn
-        self.max_size = max_size
-        self._save_all = (max_size == -1)
-
-    def should_save(self, metric: float) -> bool:
-        if not self.full():
-            return True
-
-        # already full
-        worst_record_path = max(self.best_records, key=self.best_records.get)
-        worst_metric = self.best_records[worst_record_path]
-        return metric < worst_metric
-
-    def full(self):
-        return (not self._save_all) and len(self.best_records) == self.max_size
-
-    def add_checkpoint(self, metric, path):
-        if self.should_save(metric):
-            self.save_checkpoint_and_update(metric, path)
-
-    def save_checkpoint_and_update(self, metric, path):
-        # remove the worst
-        if self.full():
-            worst_record_path = max(self.best_records,
-                                    key=self.best_records.get)
-            self.best_records.pop(worst_record_path)
-            self.del_fn(worst_record_path)
-
-        # add the new one
-        self.save_fn(path)
-        self.best_records[path] = metric
-
-
-class KLatest(object):
-    """
-    A utility class to help save the hard drive by only keeping K latest 
-    checkpoints. 
-    
-    To be as modularized as possible, this class does not assume anything like 
-    a Trainer class or anything like a checkpoint directory, it does not know 
-    about the model or the optimizer, etc. 
-    
-    It is basically a dynamically mantained Queue. When a new item is 
-    added to the queue, save_fn is called. And when an item is removed from the 
-    queue, del_fn is called. `save_fn` and `del_fn` takes a Path object as input 
-    and returns nothing.
-
-    Though it is designed to control checkpointing behaviors, it can be used 
-    to do something else if you pass some save_fn and del_fn.
-
-    Example
-    --------
-
-    >>> from pathlib import Path
-    >>> import shutil
-    >>> import paddle
-    >>> from paddle import nn
-    
-    >>> model = nn.Linear(2, 3)
-    >>> def save_model(path):
-    ...     paddle.save(model.state_dict(), path)
-
-    >>> klatest_manager = KLatest(max_size=5, save_fn=save_model)
-    >>> checkpoint_dir = Path("checkpoints")
-    >>> shutil.rmtree(checkpoint_dir)
-    >>> checkpoint_dir.mkdir(parents=True)
-    >>> for i in range(20):
-    ...     path = checkpoint_dir / f"step_{i}"
-    ...     klatest_manager.add_checkpoint(path)
-    >>> assert len(list(checkpoint_dir.glob("step_*"))) == 5
-    """
-
-    def __init__(self,
-                 max_size: int=5,
-                 save_fn: Callable[[Path], None]=None,
-                 del_fn: Callable[[Path], None]=lambda f: f.unlink()):
-        self.latest_records: List[Path] = []
-        self.save_fn = save_fn
-        self.del_fn = del_fn
-        self.max_size = max_size
-        self._save_all = (max_size == -1)
-
-    def full(self):
-        return (
-            not self._save_all) and len(self.latest_records) == self.max_size
-
-    def add_checkpoint(self, path):
-        self.save_checkpoint_and_update(path)
-
-    def save_checkpoint_and_update(self, path):
-        # remove the earist
-        if self.full():
-            eariest_record_path = self.latest_records.pop(0)
-            self.del_fn(eariest_record_path)
-
-        # add the new one
-        self.save_fn(path)
-        self.latest_records.append(path)
--- a/tests/test_checkpoint.py
+++ b/tests/test_checkpoint.py
@ -1,52 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pathlib import Path
-import shutil
-
-import numpy as np
-from parakeet.training.checkpoint import KBest, KLatest
-
-
-def test_kbest():
-    def save_fn(path):
-        with open(path, 'wt') as f:
-            f.write(f"My path is {str(path)}\n")
-
-    K = 1
-    kbest_manager = KBest(max_size=K, save_fn=save_fn)
-    checkpoint_dir = Path("checkpoints")
-    shutil.rmtree(checkpoint_dir)
-    checkpoint_dir.mkdir(parents=True)
-    a = np.random.rand(20)
-    for i, score in enumerate(a):
-        path = checkpoint_dir / f"step_{i}"
-        kbest_manager.add_checkpoint(score, path)
-    assert len(list(checkpoint_dir.glob("step_*"))) == K
-
-
-def test_klatest():
-    def save_fn(path):
-        with open(path, 'wt') as f:
-            f.write(f"My path is {str(path)}\n")
-
-    K = 5
-    klatest_manager = KLatest(max_size=K, save_fn=save_fn)
-    checkpoint_dir = Path("checkpoints")
-    shutil.rmtree(checkpoint_dir)
-    checkpoint_dir.mkdir(parents=True)
-    for i in range(20):
-        path = checkpoint_dir / f"step_{i}"
-        klatest_manager.add_checkpoint(path)
-    assert len(list(checkpoint_dir.glob("step_*"))) == K
Author	SHA1	Message	Date
Hui Zhang	c272a843e9	Merge pull request #118 from PaddlePaddle/revert-116-fastspeech Revert "Add models/fastspeech2"	2021-06-16 14:22:00 +08:00
Hui Zhang	ffcafb9b18	Revert "Add models/fastspeech2"	2021-06-16 14:21:24 +08:00
Hui Zhang	8224983d10	Merge pull request #116 from iclementine/fastspeech Add models/fastspeech2	2021-06-16 14:20:30 +08:00
Feiyu Chan	106e8cf770	Merge pull request #112 from iclementine/release/v0.3 dump version to 0.3.1	2021-05-20 11:09:19 +08:00
chenfeiyu	276df568e2	bump version to 0.3.1	2021-05-20 10:27:01 +08:00
chenfeiyu	88e97a5963	Merge branch 'release/v0.3' of https://github.com/PaddlePaddle/Parakeet into release/v0.3	2021-05-18 18:56:07 +08:00
Feiyu Chan	d726863138	fix a config key error (#110 )	2021-05-18 18:13:36 +08:00
chenfeiyu	8caefd0094	fix a config key error	2021-05-18 17:53:09 +08:00
chenfeiyu	fa6ddf5b0c	bump version string to 0.3.0	2021-05-17 11:33:39 +08:00
chenfeiyu	c02adfdad8	Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into develop	2021-05-17 11:29:31 +08:00
chenfeiyu	e1a7c296fe	simplify text processing code and update notebook	2021-05-13 17:06:34 +08:00
chenfeiyu	6a1fb158d9	format code with pre-commit	2021-05-13 16:22:56 +08:00
chenfeiyu	73ca693395	add praatio into requirements for running the experiments	2021-05-11 22:46:09 +08:00
chenfeiyu	2f644e1b8b	refine READMEs and clean code	2021-05-11 22:44:02 +08:00
chenfeiyu	8bcbcf8a86	add links to downlaod pretrained models	2021-05-07 16:49:11 +08:00
chenfeiyu	71a87559da	update README	2021-05-07 16:28:23 +08:00
chenfeiyu	664fc20c0a	update doc	2021-05-07 16:16:58 +08:00
chenfeiyu	b9aa61b5eb	update docstrings for tacotron	2021-05-07 16:08:31 +08:00
chenfeiyu	f197e4d04f	update README and doc	2021-05-07 15:35:47 +08:00
chenfeiyu	ef1ea56ed6	fix typos and docs	2021-05-07 15:03:54 +08:00
chenfeiyu	38831bf8b6	add extra_config keys into the default config of tacotron	2021-04-30 14:27:08 +08:00
chenfeiyu	b88a0f90aa	add STFT back	2021-04-29 17:54:07 +08:00
iclementine	42092f1f5b	update README for examples/ge2e	2021-04-29 17:15:18 +08:00
iclementine	b1304cb449	add images for exampels/tacotron2_aishell3's README	2021-04-29 17:09:40 +08:00
iclementine	cab12c2dfd	update tacotron_aishell3's README	2021-04-29 17:00:26 +08:00
iclementine	ba7639b994	update tacotron2	2021-04-29 16:43:03 +08:00
iclementine	123bbe994f	update tacotron2 from_pretrained, update setup.py	2021-04-29 16:04:32 +08:00
iclementine	701376f401	remove tacotron2_msp	2021-04-28 20:05:12 +08:00
iclementine	77eb13d95d	format code	2021-04-28 20:02:29 +08:00
chenfeiyu	cbe531158e	add plot_multiple_attentions and update visualization code in transformer_tts	2021-04-27 17:40:50 +08:00
chenfeiyu	263d3eb88b	add an optional to alter the loss and model structure of tacotron2, add an alternative config	2021-04-26 21:18:29 +08:00
chenfeiyu	4fc86abf5a	Merge branch 'baker' of https://github.com/iclementine/Parakeet into baker	2021-04-25 11:11:36 +08:00
chenfeiyu	85649725fb	add voice cloning notebook	2021-04-25 11:11:24 +08:00
iclementine	cf01a0da22	add more details to thr README, fix some preprocess scripts	2021-04-25 11:00:42 +08:00
iclementine	4426417da1	WIP: add README	2021-04-22 17:40:36 +08:00
iclementine	e8a9a118bb	clean code for data processing	2021-04-22 17:20:34 +08:00
iclementine	56f2552201	fix argument name	2021-04-22 14:50:52 +08:00
chenfeiyu	c2560e8aa2	fix argument order	2021-04-22 13:46:51 +08:00
iclementine	3a744dbf30	clean code	2021-04-22 13:25:25 +08:00
iclementine	764c35e99e	move tacotron2_msp	2021-04-22 11:00:33 +08:00
chenfeiyu	c8627fdd75	remove imports to deleted modules	2021-04-20 20:12:57 +08:00
chenfeiyu	16b4d4eecb	remove files not included in this release	2021-04-20 17:12:22 +08:00
chenfeiyu	6b3999217b	remove imports that are removed	2021-04-20 15:54:55 +08:00
iclementine	e992e17456	resolve conflict	2021-04-19 20:17:21 +08:00
iclementine	0eea7cc373	fix typos	2021-04-19 20:15:46 +08:00
iclementine	f8f3ec4709	Merge branch 'baker' of github.com:iclementine/Parakeet into baker	2021-04-19 20:12:07 +08:00
chenfeiyu	9da118e53b	merge wavenet	2021-04-19 20:09:01 +08:00
chenfeiyu	3741cc49ca	change wavenet to use on-the-fly prepeocessing	2021-04-19 19:58:36 +08:00
iclementine	b53b274585	change batch_text_id, batch_spec, batch_wav to include valid lengths in the returned value	2021-04-19 17:06:52 +08:00
iclementine	6749ce40ea	add audio datasets	2021-04-19 16:17:30 +08:00
iclementine	49f2c4b3fb	change stft to use conv1d	2021-04-16 15:01:10 +08:00
iclementine	e06c6cdfe1	WIP:update hifigan	2021-04-15 17:23:42 +08:00
iclementine	68497f89a4	WIP: add hifigan	2021-04-14 20:59:26 +08:00
chenfeiyu	e54f23befd	update collate function, data loader not does not convert nested list into numpy array.	2021-04-14 20:51:13 +08:00
chenfeiyu	c6965e2c5a	fix fmax for example/waveflow	2021-04-14 20:50:38 +08:00
iclementine	b674f63d74	add 2 frontend	2021-04-08 04:59:29 +08:00
iclementine	184745f42b	add gst layer	2021-04-08 04:59:03 +08:00
iclementine	dc3b798f82	add global condition support for tacotron2	2021-04-08 04:58:44 +08:00
chenfeiyu	5011f16c10	minor fix	2021-04-07 10:55:05 +08:00
iclementine	4d3014f4d5	add new trainer	2021-04-03 16:19:46 +08:00
iclementine	27e0201d0d	format code for tacotron_vctk, add plot_waveform to display	2021-04-02 15:46:28 +08:00
iclementine	a3fae49022	merge refactor_tacotron	2021-04-02 11:48:16 +08:00
iclementine	274d8dce07	update experiment and display	2021-04-02 11:37:48 +08:00
iclementine	15b205d6e0	Merge branch 'develop' into baker	2021-04-02 11:23:21 +08:00
chenfeiyu	8d67066765	add example for baker and aishell3	2021-04-02 11:06:34 +08:00
chenfeiyu	9babec0f98	fix text log extention name	2021-04-01 13:49:52 +08:00
chenfeiyu	752272de98	fix bugs	2021-04-01 13:15:06 +08:00
iclementine	e0052ccedf	fix typos	2021-03-31 19:38:12 +08:00
iclementine	a834e132b9	fix root path	2021-03-31 19:36:48 +08:00
iclementine	dd73ee6611	fix root path	2021-03-31 19:35:59 +08:00
iclementine	883bc16d24	fix root path	2021-03-31 19:33:33 +08:00
iclementine	9798d07337	fix visualizer	2021-03-31 19:32:23 +08:00
iclementine	f84d460613	fix class name	2021-03-31 19:31:16 +08:00
iclementine	327c7a5ce4	fix indentation	2021-03-31 19:29:09 +08:00
iclementine	4a039b6407	add vctk example for refactored tacotron	2021-03-31 17:34:19 +08:00
iclementine	7cc3e8c340	add a simple strategy to support multispeaker for tacotron.	2021-03-31 15:23:41 +08:00
iclementine	2dd393349f	Merge branch 'develop' into refactor_tacotron	2021-03-30 16:01:22 +08:00
iclementine	e3f7bb5a51	simplify visualization code	2021-03-30 15:56:14 +08:00
chenfeiyu	0fdb86834b	Merge branch 'develop' into baker	2021-03-30 14:39:11 +08:00
chenfeiyu	b5dd0cc197	fix speaker encoder and add support for 2 more datasets	2021-03-30 14:38:44 +08:00
iclementine	4757f08550	Merge branch 'develop' into baker	2021-03-29 11:17:51 +08:00
iclementine	59ed247840	fix lstm speaker encoder	2021-03-29 11:17:23 +08:00
iclementine	ab85d5ca13	Merge branch 'develop' into baker	2021-03-29 11:13:57 +08:00
iclementine	5443e23fb7	fix lstm speaker encoder	2021-03-29 11:12:02 +08:00
iclementine	6defef4944	Merge branch 'baker' of github.com:iclementine/Parakeet into baker	2021-03-29 10:49:24 +08:00
chenfeiyu	489fb69f55	Merge branch 'develop' into baker	2021-03-29 10:49:34 +08:00
iclementine	a9a78742fa	Merge branch 'develop' into baker	2021-03-29 10:42:17 +08:00
iclementine	2475da3322	add ge2e	2021-03-27 17:39:37 +08:00
chenfeiyu	a005cc88a3	WIP: baker	2021-03-27 12:43:03 +08:00
iclementine	2b62fbb614	1. change the default min value of LogMagnitude to 1e-5; 2. remove stop logit prediction from tacotron2 model.	2021-03-23 10:44:22 +08:00
iclementine	da63cfa42e	add an embedding layer.	2021-03-22 21:39:22 +08:00
iclementine	f9d6160916	add an option to normalize volume when loading audio.	2021-03-22 21:38:28 +08:00
iclementine	086fbf8e35	refactoring code	2021-03-22 21:23:46 +08:00
chenfeiyu	3c60fec900	remove bn in postnet	2021-02-27 03:26:41 +08:00
chenfeiyu	929165b64a	1. remove space from numericalized representation; 2. fix decoder paddign mask's unsqueeze dim.	2021-02-27 02:59:38 +08:00
chenfeiyu	ae9e218073	use emb add in tacotron2	2021-02-26 18:08:26 +08:00
chenfeiyu	40237c40b0	Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into baker	2021-02-26 11:07:03 +08:00
chenfeiyu	9e4d5a3d8a	fix experiments for waveflow and wavenet, only write visual log in rank-0	2021-02-21 17:30:13 +08:00
chenfeiyu	6a92fde9b2	Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into baker	2021-02-18 19:58:27 +08:00
chenfeiyu	25bd8987a6	Merge branch 'develop' of https://github.com/PaddlePaddle/Parakeet into baker	2021-02-18 19:51:56 +08:00
chenfeiyu	239703be8b	hacky thing, add tone support for acoustic model	2021-02-10 22:58:08 +08:00