ParakeetEricRoss/parakeet/models/fastspeech/fft_block.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward


class FFTBlock(dg.Layer):
    """
    Feed Forward Transformer (FFT) block used by FastSpeech.

    The block applies multi-head self-attention and then a position-wise
    feed-forward network, multiplying the result of each stage by the
    non-padding mask.
    """

    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 filter_size,
                 padding,
                 dropout=0.2):
        """
        Build the self-attention and feed-forward sublayers.

        Args:
            d_model: first positional argument of both ``MultiheadAttention``
                and ``PositionwiseFeedForward``.
            d_inner: second positional argument of ``PositionwiseFeedForward``.
            n_head: forwarded to ``MultiheadAttention`` as ``num_head``.
            d_k: second positional argument of ``MultiheadAttention``.
            d_v: third positional argument of ``MultiheadAttention``.
            filter_size: forwarded to ``PositionwiseFeedForward``.
            padding: forwarded to ``PositionwiseFeedForward``.
            dropout: forwarded to both sublayers. Defaults to 0.2.
        """
        super(FFTBlock, self).__init__()

        # Self-attention sublayer; it is created first so the sublayer
        # registration order stays attention -> feed-forward.
        self.slf_attn = MultiheadAttention(
            d_model, d_k, d_v,
            num_head=n_head,
            is_bias=True,
            dropout=dropout,
            is_concat=False)

        # Position-wise feed-forward sublayer.
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner,
            filter_size=filter_size,
            padding=padding,
            dropout=dropout)

    def forward(self, enc_input, non_pad_mask, slf_attn_mask=None):
        """
        Run self-attention followed by the feed-forward network.

        Args:
            enc_input (Variable): Shape(B, T, C), dtype: float32. The embedded
                character input, where T is the number of input timesteps.
                It serves as query, key and value of the self-attention.
            non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The sequence
                mask multiplied into the output of each sublayer.
            slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The
                self-attention mask, where len_q is the query sequence length
                and len_k is the key sequence length. Defaults to None.

        Returns:
            output (Variable), Shape(B, T, C), the output after self-attention & ffn.
            slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
        """
        # Query, key and value are all the same tensor (self-attention).
        attended, attn_weights = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        attended *= non_pad_mask

        transformed = self.pos_ffn(attended)
        transformed *= non_pad_mask

        return transformed, attn_weights
add license 2020-02-26 21:03:51 +08:00			`# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`import numpy as np`
			`import math`
			`import paddle.fluid.dygraph as dg`
			`import paddle.fluid.layers as layers`
			`import paddle.fluid as fluid`
			`from parakeet.modules.multihead_attention import MultiheadAttention`
Adjust the directory structure 2020-02-11 16:57:30 +08:00			`from parakeet.modules.ffn import PositionwiseFeedForward`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00
add license 2020-02-26 21:03:51 +08:00
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`class FFTBlock(dg.Layer):`
add license 2020-02-26 21:03:51 +08:00			`def __init__(self,`
			`d_model,`
			`d_inner,`
			`n_head,`
			`d_k,`
			`d_v,`
			`filter_size,`
			`padding,`
			`dropout=0.2):`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`super(FFTBlock, self).__init__()`
add license 2020-02-26 21:03:51 +08:00			`self.slf_attn = MultiheadAttention(`
			`d_model,`
			`d_k,`
			`d_v,`
			`num_head=n_head,`
			`is_bias=True,`
			`dropout=dropout,`
			`is_concat=False)`
			`self.pos_ffn = PositionwiseFeedForward(`
			`d_model,`
			`d_inner,`
			`filter_size=filter_size,`
			`padding=padding,`
			`dropout=dropout)`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00
Modified data.py to generate masks as models inputs 2020-03-05 15:08:12 +08:00			`def forward(self, enc_input, non_pad_mask, slf_attn_mask=None):`
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`"""`
			`Feed Forward Transformer block in FastSpeech.`

			`Args:`
			`enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.`
			`T means the timesteps of input.`
			`non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.`
			`slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.`
			`len_q means the sequence length of query, len_k means the sequence length of key.`

			`Returns:`
			`output (Variable), Shape(B, T, C), the output after self-attention & ffn.`
			`slf_attn (Variable), Shape(B * n_head, T, T), the self attention.`
			`"""`
add license 2020-02-26 21:03:51 +08:00			`output, slf_attn = self.slf_attn(`
			`enc_input, enc_input, enc_input, mask=slf_attn_mask)`
Modified data.py to generate masks as models inputs 2020-03-05 15:08:12 +08:00
add transformerTTS and fastspeech 2020-02-10 15:38:29 +08:00			`output *= non_pad_mask`

			`output = self.pos_ffn(output)`
			`output *= non_pad_mask`

add license 2020-02-26 21:03:51 +08:00			`return output, slf_attn`