ParakeetEricRoss/parakeet/models/fastspeech/decoder.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock


class Decoder(dg.Layer):
    """Decoder of FastSpeech.

    Adds a fixed (non-trainable) position embedding to its input and runs the
    result through a stack of ``n_layers`` ``FFTBlock`` sublayers.
    """

    def __init__(self,
                 len_max_seq,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        """
        Args:
            len_max_seq (int): the largest position index supported. The
                position table holds ``len_max_seq + 1`` rows because index 0
                is reserved as the padding index.
            n_layers (int): the number of FFTBlock sublayers in the stack.
            n_head (int): the number of attention heads. Forwarded to every
                FFTBlock and used to tile the self attention mask.
            d_k (int): forwarded to FFTBlock (presumably the per-head
                key/query size -- confirm against FFTBlock).
            d_v (int): forwarded to FFTBlock (presumably the per-head value
                size -- confirm against FFTBlock).
            d_model (int): the width of the position embedding, also
                forwarded to FFTBlock.
            d_inner (int): forwarded to FFTBlock.
            fft_conv1d_kernel (int): forwarded to FFTBlock.
            fft_conv1d_padding (int): forwarded to FFTBlock.
            dropout (float, optional): forwarded to FFTBlock. Defaults to 0.1.
        """
        super(Decoder, self).__init__()

        # Row 0 of the table belongs to the padding index, hence the +1.
        n_position = len_max_seq + 1
        self.n_head = n_head
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_model, padding_idx=0)
        # The embedding is initialised from the precomputed table and frozen.
        self.position_enc = dg.Embedding(
            size=[n_position, d_model],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.layer_stack = [
            FFTBlock(
                d_model,
                d_inner,
                n_head,
                d_k,
                d_v,
                fft_conv1d_kernel,
                fft_conv1d_padding,
                dropout=dropout) for _ in range(n_layers)
        ]
        # A plain Python list is not tracked by dg.Layer, so every block is
        # registered explicitly. The 'fft_{i}' names end up in parameter
        # names and must stay stable for existing checkpoints.
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)

    def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None):
        """
        Decoder layer of FastSpeech.
        Args:
            enc_seq (Variable): The output of length regulator.
                Shape: (B, T_mel, C), T_mel means the timesteps of input
                spectrum (it is added element-wise to the position embedding
                of enc_pos, so the time axes must agree),
                dtype: float32.
            enc_pos (Variable): The spectrum position.
                Shape: (B, T_mel),
                dtype: int64.
            non_pad_mask (Variable): the mask with non pad.
                Shape: (B, T_mel, 1),
                dtype: int64.
            slf_attn_mask (Variable, optional): the mask of mel spectrum. Defaults to None.
                Shape: (B, T_mel, T_mel),
                dtype: int64.
                When None, no mask is tiled and None is handed to every
                FFTBlock.

        Returns:
            dec_output (Variable): the decoder output.
                Shape: (B, T_mel, C).
            dec_slf_attn_list (list[Variable]): the decoder self attention list.
                Len: n_layers.
        """
        dec_slf_attn_list = []
        # Tile the mask n_head times along the first axis so that it lines up
        # with the per-head attention. Skipped when no mask is supplied,
        # because expand() cannot take None.
        if slf_attn_mask is not None:
            slf_attn_mask = fluid.layers.expand(slf_attn_mask,
                                                [self.n_head, 1, 1])

        # -- Forward
        dec_output = enc_seq + self.position_enc(enc_pos)

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn = dec_layer(
                dec_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list.append(dec_slf_attn)

        return dec_output, dec_slf_attn_list
add license 2020-02-26 21:03:51 +08:00			`# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`import paddle.fluid.dygraph as dg`
			`import paddle.fluid as fluid`
remove utils.py to models/transdformer_tts 2020-02-20 10:17:25 +08:00			`from parakeet.models.transformer_tts.utils import *`
move data.py and rename some files 2020-02-13 20:46:21 +08:00			`from parakeet.models.fastspeech.fft_block import FFTBlock`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00
add license 2020-02-26 21:03:51 +08:00
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`class Decoder(dg.Layer):`
			`def __init__(self,`
			`len_max_seq,`
			`n_layers,`
			`n_head,`
			`d_k,`
			`d_v,`
			`d_model,`
			`d_inner,`
			`fft_conv1d_kernel,`
			`fft_conv1d_padding,`
			`dropout=0.1):`
			`super(Decoder, self).__init__()`

			`n_position = len_max_seq + 1`
Modified data.py to generate masks as models inputs 2020-03-05 15:08:12 +08:00			`self.n_head = n_head`
add license 2020-02-26 21:03:51 +08:00			`self.pos_inp = get_sinusoid_encoding_table(`
			`n_position, d_model, padding_idx=0)`
			`self.position_enc = dg.Embedding(`
			`size=[n_position, d_model],`
			`padding_idx=0,`
			`param_attr=fluid.ParamAttr(`
			`initializer=fluid.initializer.NumpyArrayInitializer(`
			`self.pos_inp),`
			`trainable=False))`
			`self.layer_stack = [`
			`FFTBlock(`
			`d_model,`
			`d_inner,`
			`n_head,`
			`d_k,`
			`d_v,`
			`fft_conv1d_kernel,`
			`fft_conv1d_padding,`
			`dropout=dropout) for _ in range(n_layers)`
			`]`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`for i, layer in enumerate(self.layer_stack):`
			`self.add_sublayer('fft_{}'.format(i), layer)`
add license 2020-02-26 21:03:51 +08:00
Modified data.py to generate masks as models inputs 2020-03-05 15:08:12 +08:00			`def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None):`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`"""`
			`Decoder layer of FastSpeech.`
			`Args:`
add docstring for transformer_tts and fastspeech 2020-03-09 15:16:02 +08:00			`enc_seq (Variable): The output of length regulator.`
			`Shape: (B, T_text, C), T_text means the timesteps of input text,`
			`dtype: float32.`
			`enc_pos (Variable): The spectrum position.`
			`Shape: (B, T_mel), T_mel means the timesteps of input spectrum,`
			`dtype: int64.`
			`non_pad_mask (Variable): the mask with non pad.`
			`Shape: (B, T_mel, 1),`
			`dtype: int64.`
			`slf_attn_mask (Variable, optional): the mask of mel spectrum. Defaults to None.`
			`Shape: (B, T_mel, T_mel),`
			`dtype: int64.`

add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`Returns:`
add docstring for transformer_tts and fastspeech 2020-03-09 15:16:02 +08:00			`dec_output (Variable): the decoder output.`
			`Shape: (B, T_mel, C).`
			`dec_slf_attn_list (list[Variable]): the decoder self attention list.`
			`Len: n_layers.`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`"""`
			`dec_slf_attn_list = []`
Modified data.py to generate masks as models inputs 2020-03-05 15:08:12 +08:00			`slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00
			`# -- Forward`
			`dec_output = enc_seq + self.position_enc(enc_pos)`

			`for dec_layer in self.layer_stack:`
			`dec_output, dec_slf_attn = dec_layer(`
			`dec_output,`
			`non_pad_mask=non_pad_mask,`
			`slf_attn_mask=slf_attn_mask)`
			`dec_slf_attn_list += [dec_slf_attn]`

add license 2020-02-26 21:03:51 +08:00			`return dec_output, dec_slf_attn_list`