From d1ba42ea6827d797db25ff125dffd27008fda677 Mon Sep 17 00:00:00 2001
From: lifuchen
Date: Fri, 8 May 2020 03:58:45 +0000
Subject: [PATCH] modified fastspeech to make sure it works on paddle 1.8

---
 examples/fastspeech/data.py              |  8 +-----
 examples/fastspeech/synthesis.py         | 16 ++---------
 examples/fastspeech/train.py             | 19 ++++---------
 parakeet/models/fastspeech/decoder.py    | 21 ++++++++++----
 parakeet/models/fastspeech/encoder.py    | 11 ++++----
 parakeet/models/fastspeech/fastspeech.py | 36 +++---------------------
 6 files changed, 33 insertions(+), 78 deletions(-)

diff --git a/examples/fastspeech/data.py b/examples/fastspeech/data.py
index b7d5abe..da1ffec 100644
--- a/examples/fastspeech/data.py
+++ b/examples/fastspeech/data.py
@@ -186,10 +186,4 @@ def batch_examples(batch):
     mels = np.transpose(
         SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  #(B,T,num_mels)
 
-    enc_slf_mask = get_attn_key_pad_mask(pos_texts).astype(np.float32)
-    enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32)
-    dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels, mels).astype(np.float32)
-    dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32)
-
-    return (texts, mels, pos_texts, pos_mels, enc_slf_mask, enc_query_mask,
-            dec_slf_mask, dec_query_slf_mask, alignments)
+    return (texts, mels, pos_texts, pos_mels, alignments)
diff --git a/examples/fastspeech/synthesis.py b/examples/fastspeech/synthesis.py
index 781bbcb..6039882 100644
--- a/examples/fastspeech/synthesis.py
+++ b/examples/fastspeech/synthesis.py
@@ -28,7 +28,7 @@ from parakeet.models.fastspeech.fastspeech import FastSpeech
 from parakeet.models.transformer_tts.utils import *
 from parakeet.models.wavenet import WaveNet, UpsampleNet
 from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
-from parakeet.utils.layer_tools import summary, freeze
+from parakeet.utils.layer_tools import freeze
 from parakeet.utils import io
 
 
@@ -82,22 +82,11 @@ def synthesis(text_input, args):
     text = np.expand_dims(text, axis=0)
     pos_text = np.arange(1, text.shape[1] + 1)
     pos_text = np.expand_dims(pos_text, axis=0)
-    enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32)
-    enc_slf_attn_mask = get_attn_key_pad_mask(pos_text).astype(np.float32)
 
     text = dg.to_variable(text)
     pos_text = dg.to_variable(pos_text)
-    enc_non_pad_mask = dg.to_variable(enc_non_pad_mask)
-    enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask)
 
-    _, mel_output_postnet = model(
-        text,
-        pos_text,
-        alpha=args.alpha,
-        enc_non_pad_mask=enc_non_pad_mask,
-        enc_slf_attn_mask=enc_slf_attn_mask,
-        dec_non_pad_mask=None,
-        dec_slf_attn_mask=None)
+    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
 
     result = np.exp(mel_output_postnet.numpy())
     mel_output_postnet = fluid.layers.transpose(
@@ -186,7 +175,6 @@ def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
     lmd = config["loss"]["lmd"]
     model = Clarinet(upsample_net, teacher, student, stft,
                      student_log_scale_min, lmd)
-    summary(model)
     io.load_parameters(model=model, checkpoint_path=checkpoint)
 
     if not os.path.exists(args.output):
diff --git a/examples/fastspeech/train.py b/examples/fastspeech/train.py
index 21e8ee9..285063f 100644
--- a/examples/fastspeech/train.py
+++ b/examples/fastspeech/train.py
@@ -79,7 +79,9 @@ def main(args):
             (cfg['train']['warm_up_step'] *
              (cfg['train']['learning_rate']**2)),
             cfg['train']['warm_up_step']),
-        parameter_list=model.parameters())
+        parameter_list=model.parameters(),
+        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
+            'grad_clip_thresh']))
 
     reader = LJSpeechLoader(
         cfg['audio'],
         place,
@@ -108,9 +110,7 @@ def main(args):
 
             for i, data in enumerate(pbar):
                 pbar.set_description('Processing at epoch %d' % epoch)
-                (character, mel, pos_text, pos_mel, enc_slf_mask,
-                 enc_query_mask, dec_slf_mask, dec_query_slf_mask,
-                 alignment) = data
+                (character, mel, pos_text, pos_mel, alignment) = data
 
                 global_step += 1
 
@@ -119,11 +119,7 @@ def main(args):
                     character,
                     pos_text,
                     mel_pos=pos_mel,
-                    length_target=alignment,
-                    enc_non_pad_mask=enc_query_mask,
-                    enc_slf_attn_mask=enc_slf_mask,
-                    dec_non_pad_mask=dec_query_slf_mask,
-                    dec_slf_attn_mask=dec_slf_mask)
+                    length_target=alignment)
                 mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                 mel_loss = layers.mse_loss(mel_output, mel)
                 mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
@@ -150,10 +146,7 @@ def main(args):
                     model.apply_collective_grads()
                 else:
                     total_loss.backward()
-                optimizer.minimize(
-                    total_loss,
-                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
-                        'train']['grad_clip_thresh']))
+                optimizer.minimize(total_loss)
                 model.clear_gradients()
 
                 # save checkpoint
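
Note on the train.py change above: Paddle 1.8 drops the grad_clip argument of
optimizer.minimize() and the fluid.dygraph_grad_clip module; clipping is now
declared once on the optimizer via fluid.clip.GradientClipByGlobalNorm, as the
hunk at line 79 does. A minimal sketch of the new pattern (the SGDOptimizer,
the Linear layer, and the clip_norm value here are illustrative stand-ins, not
part of this patch):

    import paddle.fluid as fluid
    import paddle.fluid.dygraph as dg

    with dg.guard():
        model = dg.Linear(10, 10)  # stand-in for the FastSpeech model
        optimizer = fluid.optimizer.SGDOptimizer(
            learning_rate=1e-3,
            parameter_list=model.parameters(),
            # clipping is configured here once, not per minimize() call
            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1))
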
diff --git a/parakeet/models/fastspeech/decoder.py b/parakeet/models/fastspeech/decoder.py
index 397685d..78dae16 100644
--- a/parakeet/models/fastspeech/decoder.py
+++ b/parakeet/models/fastspeech/decoder.py
@@ -70,7 +70,7 @@ class Decoder(dg.Layer):
         for i, layer in enumerate(self.layer_stack):
             self.add_sublayer('fft_{}'.format(i), layer)
 
-    def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None):
+    def forward(self, enc_seq, enc_pos):
         """
         Compute decoder outputs.
 
@@ -79,17 +79,26 @@
                 the output of length regulator, where T_mel means the timesteps of input spectrum.
             enc_pos (Variable): shape(B, T_mel), dtype int64, the spectrum position.
-            non_pad_mask (Variable): shape(B, T_mel, 1), dtype int64, the mask with non pad.
-            slf_attn_mask (Variable, optional): shape(B, T_mel, T_mel), dtype int64,
-                the mask of mel spectrum. Defaults to None.
 
         Returns:
             dec_output (Variable): shape(B, T_mel, C), the decoder output.
             dec_slf_attn_list (list[Variable]): len(n_layers), the decoder self attention list.
         """
         dec_slf_attn_list = []
 
-        if slf_attn_mask:
-            slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])
+        if fluid.framework._dygraph_tracer()._train_mode:
+            slf_attn_mask = get_dec_attn_key_pad_mask(enc_pos, self.n_head,
+                                                      enc_seq.dtype)
+
+        else:
+            len_q = enc_seq.shape[1]
+            slf_attn_mask = layers.triu(
+                layers.ones(
+                    shape=[len_q, len_q], dtype=enc_seq.dtype),
+                diagonal=1)
+            slf_attn_mask = layers.cast(
+                slf_attn_mask != 0, dtype=enc_seq.dtype) * -1e30
+
+        non_pad_mask = get_non_pad_mask(enc_pos, 1, enc_seq.dtype)
 
         # -- Forward
         dec_output = enc_seq + self.position_enc(enc_pos)
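
The inference branch above builds an additive causal mask: layers.triu keeps
the strictly upper-triangular entries (future positions), and the cast-and-
scale turns them into -1e30 so they vanish after softmax, while training keeps
the padding-based key mask. A standalone sketch of the same construction,
using the same ops as the patch (len_q fixed to 4 purely for illustration):

    import paddle.fluid.dygraph as dg
    import paddle.fluid.layers as layers

    with dg.guard():
        len_q = 4
        # ones strictly above the diagonal mark the future positions
        mask = layers.triu(
            layers.ones(shape=[len_q, len_q], dtype='float32'), diagonal=1)
        # additive mask: -1e30 for future keys, 0 elsewhere
        mask = layers.cast(mask != 0, dtype='float32') * -1e30
        print(mask.numpy())  # row i blocks attention to positions > i
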
diff --git a/parakeet/models/fastspeech/encoder.py b/parakeet/models/fastspeech/encoder.py
index d39fdc1..97ea75e 100644
--- a/parakeet/models/fastspeech/encoder.py
+++ b/parakeet/models/fastspeech/encoder.py
@@ -76,7 +76,7 @@ class Encoder(dg.Layer):
         for i, layer in enumerate(self.layer_stack):
             self.add_sublayer('fft_{}'.format(i), layer)
 
-    def forward(self, character, text_pos, non_pad_mask, slf_attn_mask=None):
+    def forward(self, character, text_pos):
         """
         Encode text sequence.
 
@@ -84,22 +84,21 @@
             character (Variable): shape(B, T_text), dtype float32, the input text characters,
                 where T_text means the timesteps of input characters,
             text_pos (Variable): shape(B, T_text), dtype int64, the input text position.
-            non_pad_mask (Variable): shape(B, T_text, 1), dtype int64, the mask with non pad.
-            slf_attn_mask (Variable, optional): shape(B, T_text, T_text), dtype int64,
-                the mask of input characters. Defaults to None.
 
         Returns:
             enc_output (Variable): shape(B, T_text, C), the encoder output.
-            non_pad_mask (Variable): shape(B, T_text, 1), the mask with non pad.
             enc_slf_attn_list (list[Variable]): len(n_layers), the encoder self attention list.
         """
         enc_slf_attn_list = []
-        slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])
 
         # -- Forward
         enc_output = self.src_word_emb(character) + self.position_enc(
             text_pos)  #(N, T, C)
 
+        slf_attn_mask = get_attn_key_pad_mask(text_pos, self.n_head,
+                                              enc_output.dtype)
+        non_pad_mask = get_non_pad_mask(text_pos, 1, enc_output.dtype)
+
         for enc_layer in self.layer_stack:
             enc_output, enc_slf_attn = enc_layer(
                 enc_output,
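
get_attn_key_pad_mask and get_non_pad_mask now take the position tensor, the
head count, and the target dtype; their bodies live in
parakeet.models.transformer_tts.utils and are not shown in this patch. As a
rough orientation only, a NumPy sketch of the assumed semantics, following the
usual Transformer convention that position 0 marks padding (this is not the
library code):

    import numpy as np

    def get_non_pad_mask(pos, num_head, dtype):
        # (B, T, 1): 1.0 at real timesteps, 0.0 at padded ones (assumed)
        return (pos != 0).astype(dtype)[:, :, None]

    def get_attn_key_pad_mask(pos, num_head, dtype):
        # (num_head * B, T, T) additive mask: large negative wherever the
        # attended key position is padding (assumed)
        B, T = pos.shape
        key_is_pad = (pos == 0).astype(dtype)               # (B, T)
        mask = np.tile(key_is_pad[:, None, :], (1, T, 1)) * -1e30
        return np.tile(mask, (num_head, 1, 1))

    pos = np.array([[1, 2, 3, 0]])  # one padded timestep at the end
    print(get_non_pad_mask(pos, 1, np.float32)[..., 0])  # [[1. 1. 1. 0.]]
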
""" - encoder_output, enc_slf_attn_list = self.encoder( - character, - text_pos, - enc_non_pad_mask, - slf_attn_mask=enc_slf_attn_mask) + encoder_output, enc_slf_attn_list = self.encoder(character, text_pos) if fluid.framework._dygraph_tracer()._train_mode: length_regulator_output, duration_predictor_output = self.length_regulator( encoder_output, target=length_target, alpha=alpha) decoder_output, dec_slf_attn_list = self.decoder( - length_regulator_output, - mel_pos, - dec_non_pad_mask, - slf_attn_mask=dec_slf_attn_mask) + length_regulator_output, mel_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output @@ -142,19 +125,8 @@ class FastSpeech(dg.Layer): else: length_regulator_output, decoder_pos = self.length_regulator( encoder_output, alpha=alpha) - slf_attn_mask = get_triu_tensor( - decoder_pos.numpy(), decoder_pos.numpy()).astype(np.float32) - slf_attn_mask = np.expand_dims(slf_attn_mask, axis=0) - slf_attn_mask = fluid.layers.cast( - dg.to_variable(slf_attn_mask != 0), np.float32) * (-2**32 + 1) - slf_attn_mask = dg.to_variable(slf_attn_mask) - dec_non_pad_mask = fluid.layers.unsqueeze( - (decoder_pos != 0).astype(np.float32), [-1]) - decoder_output, _ = self.decoder( - length_regulator_output, - decoder_pos, - dec_non_pad_mask, - slf_attn_mask=slf_attn_mask) + decoder_output, _ = self.decoder(length_regulator_output, + decoder_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output