format docstrings
This commit is contained in:
parent
3af3c29a94
commit
6553d1d723
|
@ -12,28 +12,26 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Fastspeech2 related modules for paddle"""
|
"""Fastspeech2 related modules for paddle"""
|
||||||
import logging
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from typing import Sequence
|
from typing import Sequence
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
from typeguard import check_argument_types
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
import numpy as np
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
|
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
|
||||||
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
|
from parakeet.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
|
||||||
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
|
||||||
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
|
from parakeet.modules.fastspeech2_predictor.length_regulator import LengthRegulator
|
||||||
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
|
from parakeet.modules.fastspeech2_predictor.postnet import Postnet
|
||||||
from parakeet.modules.nets_utils import make_non_pad_mask
|
from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
|
||||||
from parakeet.modules.nets_utils import make_pad_mask
|
|
||||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||||
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
|
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
|
||||||
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
|
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
|
||||||
|
from parakeet.modules.nets_utils import initialize
|
||||||
|
from parakeet.modules.nets_utils import make_non_pad_mask
|
||||||
|
from parakeet.modules.nets_utils import make_pad_mask
|
||||||
|
|
||||||
|
|
||||||
class FastSpeech2(nn.Layer):
|
class FastSpeech2(nn.Layer):
|
||||||
|
@ -155,7 +153,6 @@ class FastSpeech2(nn.Layer):
|
||||||
positionwise_layer_type=positionwise_layer_type,
|
positionwise_layer_type=positionwise_layer_type,
|
||||||
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
|
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
|
||||||
else:
|
else:
|
||||||
print("encoder_type:", encoder_type)
|
|
||||||
raise ValueError(f"{encoder_type} is not supported.")
|
raise ValueError(f"{encoder_type} is not supported.")
|
||||||
|
|
||||||
# define duration predictor
|
# define duration predictor
|
||||||
|
@ -236,6 +233,12 @@ class FastSpeech2(nn.Layer):
|
||||||
use_batch_norm=use_batch_norm,
|
use_batch_norm=use_batch_norm,
|
||||||
dropout_rate=postnet_dropout_rate, ))
|
dropout_rate=postnet_dropout_rate, ))
|
||||||
|
|
||||||
|
# initialize parameters
|
||||||
|
self._reset_parameters(
|
||||||
|
init_type=init_type,
|
||||||
|
init_enc_alpha=init_enc_alpha,
|
||||||
|
init_dec_alpha=init_dec_alpha, )
|
||||||
|
|
||||||
# define criterions
|
# define criterions
|
||||||
self.criterion = FastSpeech2Loss(
|
self.criterion = FastSpeech2Loss(
|
||||||
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
|
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
|
||||||
|
@ -253,25 +256,37 @@ class FastSpeech2(nn.Layer):
|
||||||
energy: paddle.Tensor,
|
energy: paddle.Tensor,
|
||||||
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
|
energy_lengths: paddle.Tensor, ) -> Tuple[paddle.Tensor, Dict[
|
||||||
str, paddle.Tensor], paddle.Tensor]:
|
str, paddle.Tensor], paddle.Tensor]:
|
||||||
# """Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
# Args:
|
Parameters
|
||||||
# text (LongTensor): Batch of padded token ids (B, Tmax).
|
----------
|
||||||
# text_lengths (LongTensor): Batch of lengths of each input (B,).
|
text : LongTensor
|
||||||
# speech (Tensor): Batch of padded target features (B, Lmax, odim).
|
Batch of padded token ids (B, Tmax).
|
||||||
# speech_lengths (LongTensor): Batch of the lengths of each target (B,).
|
text_lengths : LongTensor
|
||||||
# durations (LongTensor): Batch of padded durations (B, Tmax + 1).
|
Batch of lengths of each input (B,).
|
||||||
# durations_lengths (LongTensor): Batch of duration lengths (B, Tmax + 1).
|
speech : Tensor
|
||||||
# pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
Batch of padded target features (B, Lmax, odim).
|
||||||
# pitch_lengths (LongTensor): Batch of pitch lengths (B, Tmax + 1).
|
speech_lengths : LongTensor
|
||||||
# energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
Batch of the lengths of each target (B,).
|
||||||
# energy_lengths (LongTensor): Batch of energy lengths (B, Tmax + 1).
|
durations : LongTensor
|
||||||
# Returns:
|
Batch of padded durations (B, Tmax + 1).
|
||||||
# Tensor: Loss scalar value.
|
durations_lengths : LongTensor
|
||||||
# Dict: Statistics to be monitored.
|
Batch of duration lengths (B, Tmax + 1).
|
||||||
# Tensor: Weight value.
|
pitch : Tensor
|
||||||
|
Batch of padded token-averaged pitch (B, Tmax + 1, 1).
|
||||||
# """
|
pitch_lengths : LongTensor
|
||||||
|
Batch of pitch lengths (B, Tmax + 1).
|
||||||
|
energy : Tensor
|
||||||
|
Batch of padded token-averaged energy (B, Tmax + 1, 1).
|
||||||
|
energy_lengths : LongTensor
|
||||||
|
Batch of energy lengths (B, Tmax + 1).
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
Tensor
|
||||||
|
Loss scalar value.
|
||||||
|
Dict
|
||||||
|
Statistics to be monitored.
|
||||||
|
"""
|
||||||
text = text[:, :text_lengths.max()] # for data-parallel
|
text = text[:, :text_lengths.max()] # for data-parallel
|
||||||
speech = speech[:, :speech_lengths.max()] # for data-parallel
|
speech = speech[:, :speech_lengths.max()] # for data-parallel
|
||||||
durations = durations[:, :durations_lengths.max()] # for data-parallel
|
durations = durations[:, :durations_lengths.max()] # for data-parallel
|
||||||
|
@ -282,16 +297,11 @@ class FastSpeech2(nn.Layer):
|
||||||
|
|
||||||
# Add eos at the last of sequence
|
# Add eos at the last of sequence
|
||||||
# xs = F.pad(text, [0, 1], "constant", self.padding_idx)
|
# xs = F.pad(text, [0, 1], "constant", self.padding_idx)
|
||||||
print("xs.shape in fastspeech2.py before:", text.shape, text)
|
|
||||||
xs = np.pad(text.numpy(),
|
xs = np.pad(text.numpy(),
|
||||||
pad_width=((0, 0), (0, 1)),
|
pad_width=((0, 0), (0, 1)),
|
||||||
mode="constant",
|
mode="constant",
|
||||||
constant_values=self.padding_idx)
|
constant_values=self.padding_idx)
|
||||||
xs = paddle.to_tensor(xs)
|
xs = paddle.to_tensor(xs)
|
||||||
print("xs.shape in fastspeech2.py end:", xs.shape, xs)
|
|
||||||
# my_pad = nn.Pad1D(padding=[0, 1], mode="constant", value=self.padding_idx)
|
|
||||||
# xs = my_pad(text)
|
|
||||||
# 是否会数组越界? xs 是否能取到 l -> 可以,因为上一步补充了一个 padding_idx,又变成了 eos
|
|
||||||
for i, l in enumerate(text_lengths):
|
for i, l in enumerate(text_lengths):
|
||||||
xs[i, l] = self.eos
|
xs[i, l] = self.eos
|
||||||
ilens = text_lengths + 1
|
ilens = text_lengths + 1
|
||||||
|
@ -302,23 +312,16 @@ class FastSpeech2(nn.Layer):
|
||||||
# forward propagation
|
# forward propagation
|
||||||
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
|
before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(
|
||||||
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
|
xs, ilens, ys, olens, ds, ps, es, is_inference=False)
|
||||||
print("d_outs in paddle:", d_outs)
|
|
||||||
print("p_outs in paddle:", p_outs)
|
|
||||||
print("e_outs in paddle:", e_outs)
|
|
||||||
|
|
||||||
# modify mod part of groundtruth
|
# modify mod part of groundtruth
|
||||||
if self.reduction_factor > 1:
|
if self.reduction_factor > 1:
|
||||||
# 需要改
|
|
||||||
olens = paddle.to_tensor([
|
olens = paddle.to_tensor([
|
||||||
olen - olen % self.reduction_factor for olen in olens.numpy()
|
olen - olen % self.reduction_factor for olen in olens.numpy()
|
||||||
])
|
])
|
||||||
max_olen = max(olens)
|
max_olen = max(olens)
|
||||||
ys = ys[:, :max_olen]
|
ys = ys[:, :max_olen]
|
||||||
|
|
||||||
# calculate loss
|
# calculate loss
|
||||||
if self.postnet is None:
|
if self.postnet is None:
|
||||||
after_outs = None
|
after_outs = None
|
||||||
|
|
||||||
# calculate loss
|
# calculate loss
|
||||||
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
|
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
|
||||||
after_outs=after_outs,
|
after_outs=after_outs,
|
||||||
|
@ -363,9 +366,8 @@ class FastSpeech2(nn.Layer):
|
||||||
alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
|
alpha: float=1.0, ) -> Sequence[paddle.Tensor]:
|
||||||
# forward encoder
|
# forward encoder
|
||||||
x_masks = self._source_mask(ilens)
|
x_masks = self._source_mask(ilens)
|
||||||
print("xs.shape in fastspeech2.py:", xs.shape)
|
|
||||||
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
|
||||||
|
|
||||||
|
hs, _ = self.encoder(xs, x_masks) # (B, Tmax, adim)
|
||||||
# forward duration predictor and variance predictors
|
# forward duration predictor and variance predictors
|
||||||
d_masks = make_pad_mask(ilens)
|
d_masks = make_pad_mask(ilens)
|
||||||
|
|
||||||
|
@ -377,10 +379,11 @@ class FastSpeech2(nn.Layer):
|
||||||
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
|
e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
|
||||||
else:
|
else:
|
||||||
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
|
e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
|
||||||
print("p_outs.shape:", p_outs.shape)
|
|
||||||
if is_inference:
|
if is_inference:
|
||||||
d_outs = self.duration_predictor.inference(hs,
|
d_outs = self.duration_predictor.inference(hs,
|
||||||
d_masks) # (B, Tmax)
|
d_masks) # (B, Tmax)
|
||||||
|
# print("d_outs:",d_outs)
|
||||||
# use prediction in inference
|
# use prediction in inference
|
||||||
# (B, Tmax, 1)
|
# (B, Tmax, 1)
|
||||||
|
|
||||||
|
@ -404,7 +407,6 @@ class FastSpeech2(nn.Layer):
|
||||||
# forward decoder
|
# forward decoder
|
||||||
if olens is not None and not is_inference:
|
if olens is not None and not is_inference:
|
||||||
if self.reduction_factor > 1:
|
if self.reduction_factor > 1:
|
||||||
# 直接to_paddle ,维度会增加 1,需要先转成 numpy
|
|
||||||
olens_in = paddle.to_tensor(
|
olens_in = paddle.to_tensor(
|
||||||
[olen // self.reduction_factor for olen in olens.numpy()])
|
[olen // self.reduction_factor for olen in olens.numpy()])
|
||||||
else:
|
else:
|
||||||
|
@ -412,9 +414,10 @@ class FastSpeech2(nn.Layer):
|
||||||
h_masks = self._source_mask(olens_in)
|
h_masks = self._source_mask(olens_in)
|
||||||
else:
|
else:
|
||||||
h_masks = None
|
h_masks = None
|
||||||
zs, _ = self.decoder(hs, h_masks) # (B, Lmax, adim)
|
# (B, Lmax, adim)
|
||||||
before_outs = self.feat_out(zs).reshape(
|
zs, _ = self.decoder(hs, h_masks)
|
||||||
(zs.shape[0], -1, self.odim)) # (B, Lmax, odim)
|
# (B, Lmax, odim)
|
||||||
|
before_outs = self.feat_out(zs).reshape((zs.shape[0], -1, self.odim))
|
||||||
|
|
||||||
# postnet -> (B, Lmax//r * r, odim)
|
# postnet -> (B, Lmax//r * r, odim)
|
||||||
if self.postnet is None:
|
if self.postnet is None:
|
||||||
|
@ -437,20 +440,30 @@ class FastSpeech2(nn.Layer):
|
||||||
paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
||||||
"""Generate the sequence of features given the sequences of characters.
|
"""Generate the sequence of features given the sequences of characters.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
text (LongTensor): Input sequence of characters (T,).
|
----------
|
||||||
speech (Tensor, optional): Feature sequence to extract style (N, idim).
|
text : LongTensor
|
||||||
durations (LongTensor, optional): Groundtruth of duration (T + 1,).
|
Input sequence of characters (T,).
|
||||||
pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1).
|
speech : Tensor, optional
|
||||||
energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1).
|
Feature sequence to extract style (N, idim).
|
||||||
alpha (float, optional): Alpha to control the speed.
|
durations : LongTensor, optional
|
||||||
use_teacher_forcing (bool, optional): Whether to use teacher forcing.
|
Groundtruth of duration (T + 1,).
|
||||||
|
pitch : Tensor, optional
|
||||||
|
Groundtruth of token-averaged pitch (T + 1, 1).
|
||||||
|
energy : Tensor, optional
|
||||||
|
Groundtruth of token-averaged energy (T + 1, 1).
|
||||||
|
alpha : float, optional
|
||||||
|
Alpha to control the speed.
|
||||||
|
use_teacher_forcing : bool, optional
|
||||||
|
Whether to use teacher forcing.
|
||||||
If true, groundtruth of duration, pitch and energy will be used.
|
If true, groundtruth of duration, pitch and energy will be used.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Output sequence of features (L, odim).
|
----------
|
||||||
None: Dummy for compatibility.
|
Tensor
|
||||||
None: Dummy for compatibility.
|
Output sequence of features (L, odim).
|
||||||
|
None
|
||||||
|
Dummy for compatibility.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x, y = text, speech
|
x, y = text, speech
|
||||||
|
@ -460,13 +473,15 @@ class FastSpeech2(nn.Layer):
|
||||||
x = np.pad(text.numpy(),
|
x = np.pad(text.numpy(),
|
||||||
pad_width=((0, 1)),
|
pad_width=((0, 1)),
|
||||||
mode="constant",
|
mode="constant",
|
||||||
constant_values=self.padding_idx)
|
constant_values=self.eos)
|
||||||
|
|
||||||
x = paddle.to_tensor(x)
|
x = paddle.to_tensor(x)
|
||||||
|
|
||||||
# setup batch axis
|
# setup batch axis
|
||||||
ilens = paddle.to_tensor(
|
ilens = paddle.to_tensor(
|
||||||
[x.shape[0]], dtype=paddle.int64, place=x.place)
|
[x.shape[0]], dtype=paddle.int64, place=x.place)
|
||||||
xs, ys = x.unsqueeze(0), None
|
xs, ys = x.unsqueeze(0), None
|
||||||
|
|
||||||
if y is not None:
|
if y is not None:
|
||||||
ys = y.unsqueeze(0)
|
ys = y.unsqueeze(0)
|
||||||
|
|
||||||
|
@ -493,14 +508,19 @@ class FastSpeech2(nn.Layer):
|
||||||
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
|
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
|
||||||
"""Make masks for self-attention.
|
"""Make masks for self-attention.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
ilens (LongTensor): Batch of lengths (B,).
|
----------
|
||||||
|
ilens : LongTensor
|
||||||
|
Batch of lengths (B,).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Mask tensor for self-attention.
|
-------
|
||||||
|
Tensor
|
||||||
|
Mask tensor for self-attention.
|
||||||
dtype=paddle.bool
|
dtype=paddle.bool
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
--------
|
||||||
>>> ilens = [5, 3]
|
>>> ilens = [5, 3]
|
||||||
>>> self._source_mask(ilens)
|
>>> self._source_mask(ilens)
|
||||||
tensor([[[1, 1, 1, 1, 1],
|
tensor([[[1, 1, 1, 1, 1],
|
||||||
|
@ -510,6 +530,29 @@ class FastSpeech2(nn.Layer):
|
||||||
x_masks = make_non_pad_mask(ilens)
|
x_masks = make_non_pad_mask(ilens)
|
||||||
return x_masks.unsqueeze(-2)
|
return x_masks.unsqueeze(-2)
|
||||||
|
|
||||||
|
def _reset_parameters(self,
|
||||||
|
init_type: str,
|
||||||
|
init_enc_alpha: float,
|
||||||
|
init_dec_alpha: float):
|
||||||
|
# initialize parameters
|
||||||
|
initialize(self, init_type)
|
||||||
|
|
||||||
|
# initialize alpha in scaled positional encoding
|
||||||
|
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||||
|
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
|
||||||
|
self.encoder.embed[-1].alpha = paddle.create_parameter(
|
||||||
|
shape=init_enc_alpha.shape,
|
||||||
|
dtype=str(init_enc_alpha.numpy().dtype),
|
||||||
|
default_initializer=paddle.nn.initializer.Assign(
|
||||||
|
init_enc_alpha))
|
||||||
|
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
|
||||||
|
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
|
||||||
|
self.decoder.embed[-1].alpha = paddle.create_parameter(
|
||||||
|
shape=init_dec_alpha.shape,
|
||||||
|
dtype=str(init_dec_alpha.numpy().dtype),
|
||||||
|
default_initializer=paddle.nn.initializer.Assign(
|
||||||
|
init_dec_alpha))
|
||||||
|
|
||||||
|
|
||||||
class FastSpeech2Loss(nn.Layer):
|
class FastSpeech2Loss(nn.Layer):
|
||||||
"""Loss function module for FastSpeech2."""
|
"""Loss function module for FastSpeech2."""
|
||||||
|
@ -519,12 +562,12 @@ class FastSpeech2Loss(nn.Layer):
|
||||||
use_weighted_masking: bool=False):
|
use_weighted_masking: bool=False):
|
||||||
"""Initialize feed-forward Transformer loss module.
|
"""Initialize feed-forward Transformer loss module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
use_masking (bool):
|
----------
|
||||||
|
use_masking : bool
|
||||||
Whether to apply masking for padded part in loss calculation.
|
Whether to apply masking for padded part in loss calculation.
|
||||||
use_weighted_masking (bool):
|
use_weighted_masking : bool
|
||||||
Whether to apply weighted masking in loss calculation.
|
Whether to apply weighted masking in loss calculation.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
assert check_argument_types()
|
assert check_argument_types()
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -555,24 +598,41 @@ class FastSpeech2Loss(nn.Layer):
|
||||||
paddle.Tensor, paddle.Tensor]:
|
paddle.Tensor, paddle.Tensor]:
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
|
----------
|
||||||
before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
|
after_outs : Tensor
|
||||||
d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax).
|
Batch of outputs after postnets (B, Lmax, odim).
|
||||||
p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
|
before_outs : Tensor
|
||||||
e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
|
Batch of outputs before postnets (B, Lmax, odim).
|
||||||
ys (Tensor): Batch of target features (B, Lmax, odim).
|
d_outs : LongTensor
|
||||||
ds (LongTensor): Batch of durations (B, Tmax).
|
Batch of outputs of duration predictor (B, Tmax).
|
||||||
ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
|
p_outs : Tensor
|
||||||
es (Tensor): Batch of target token-averaged energy (B, Tmax, 1).
|
Batch of outputs of pitch predictor (B, Tmax, 1).
|
||||||
ilens (LongTensor): Batch of the lengths of each input (B,).
|
e_outs : Tensor
|
||||||
olens (LongTensor): Batch of the lengths of each target (B,).
|
Batch of outputs of energy predictor (B, Tmax, 1).
|
||||||
|
ys : Tensor
|
||||||
|
Batch of target features (B, Lmax, odim).
|
||||||
|
ds : LongTensor
|
||||||
|
Batch of durations (B, Tmax).
|
||||||
|
ps : Tensor
|
||||||
|
Batch of target token-averaged pitch (B, Tmax, 1).
|
||||||
|
es : Tensor
|
||||||
|
Batch of target token-averaged energy (B, Tmax, 1).
|
||||||
|
ilens : LongTensor
|
||||||
|
Batch of the lengths of each input (B,).
|
||||||
|
olens : LongTensor
|
||||||
|
Batch of the lengths of each target (B,).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: L1 loss value.
|
----------
|
||||||
Tensor: Duration predictor loss value.
|
Tensor
|
||||||
Tensor: Pitch predictor loss value.
|
L1 loss value.
|
||||||
Tensor: Energy predictor loss value.
|
Tensor
|
||||||
|
Duration predictor loss value.
|
||||||
|
Tensor
|
||||||
|
Pitch predictor loss value.
|
||||||
|
Tensor
|
||||||
|
Energy predictor loss value.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# apply mask to remove padded part
|
# apply mask to remove padded part
|
||||||
|
|
|
@ -15,7 +15,6 @@
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from parakeet.modules.layer_norm import LayerNorm
|
from parakeet.modules.layer_norm import LayerNorm
|
||||||
from parakeet.modules.masked_fill import masked_fill
|
from parakeet.modules.masked_fill import masked_fill
|
||||||
|
|
||||||
|
@ -31,7 +30,8 @@ class DurationPredictor(nn.Layer):
|
||||||
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
|
.. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
|
||||||
https://arxiv.org/pdf/1905.09263.pdf
|
https://arxiv.org/pdf/1905.09263.pdf
|
||||||
|
|
||||||
Note:
|
Note
|
||||||
|
----------
|
||||||
The calculation domain of outputs is different
|
The calculation domain of outputs is different
|
||||||
between in `forward` and in `inference`. In `forward`,
|
between in `forward` and in `inference`. In `forward`,
|
||||||
the outputs are calculated in log domain but in `inference`,
|
the outputs are calculated in log domain but in `inference`,
|
||||||
|
@ -48,13 +48,20 @@ class DurationPredictor(nn.Layer):
|
||||||
offset=1.0):
|
offset=1.0):
|
||||||
"""Initialize duration predictor module.
|
"""Initialize duration predictor module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
n_layers (int, optional): Number of convolutional layers.
|
idim : int
|
||||||
n_chans (int, optional): Number of channels of convolutional layers.
|
Input dimension.
|
||||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
n_layers : int, optional
|
||||||
dropout_rate (float, optional): Dropout rate.
|
Number of convolutional layers.
|
||||||
offset (float, optional): Offset value to avoid nan in log domain.
|
n_chans : int, optional
|
||||||
|
Number of channels of convolutional layers.
|
||||||
|
kernel_size : int, optional
|
||||||
|
Kernel size of convolutional layers.
|
||||||
|
dropout_rate : float, optional
|
||||||
|
Dropout rate.
|
||||||
|
offset : float, optional
|
||||||
|
Offset value to avoid nan in log domain.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super(DurationPredictor, self).__init__()
|
super(DurationPredictor, self).__init__()
|
||||||
|
@ -74,7 +81,7 @@ class DurationPredictor(nn.Layer):
|
||||||
LayerNorm(
|
LayerNorm(
|
||||||
n_chans, dim=1),
|
n_chans, dim=1),
|
||||||
nn.Dropout(dropout_rate), ))
|
nn.Dropout(dropout_rate), ))
|
||||||
self.linear = nn.Linear(n_chans, 1)
|
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||||
|
|
||||||
def _forward(self, xs, x_masks=None, is_inference=False):
|
def _forward(self, xs, x_masks=None, is_inference=False):
|
||||||
# (B, idim, Tmax)
|
# (B, idim, Tmax)
|
||||||
|
@ -99,28 +106,34 @@ class DurationPredictor(nn.Layer):
|
||||||
def forward(self, xs, x_masks=None):
|
def forward(self, xs, x_masks=None):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
----------
|
||||||
x_masks (ByteTensor, optional):
|
xs : Tensor
|
||||||
|
Batch of input sequences (B, Tmax, idim).
|
||||||
|
x_masks : ByteTensor, optional
|
||||||
Batch of masks indicating padded part (B, Tmax).
|
Batch of masks indicating padded part (B, Tmax).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Batch of predicted durations in log domain (B, Tmax).
|
----------
|
||||||
|
Tensor
|
||||||
|
Batch of predicted durations in log domain (B, Tmax).
|
||||||
"""
|
"""
|
||||||
return self._forward(xs, x_masks, False)
|
return self._forward(xs, x_masks, False)
|
||||||
|
|
||||||
def inference(self, xs, x_masks=None):
|
def inference(self, xs, x_masks=None):
|
||||||
"""Inference duration.
|
"""Inference duration.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
----------
|
||||||
x_masks (ByteTensor, optional):
|
xs : Tensor
|
||||||
|
Batch of input sequences (B, Tmax, idim).
|
||||||
|
x_masks : Tensor(bool), optional
|
||||||
Batch of masks indicating padded part (B, Tmax).
|
Batch of masks indicating padded part (B, Tmax).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
LongTensor: Batch of predicted durations in linear domain int64 (B, Tmax).
|
----------
|
||||||
|
LongTensor
|
||||||
|
Batch of predicted durations in linear domain int64 (B, Tmax).
|
||||||
"""
|
"""
|
||||||
return self._forward(xs, x_masks, True)
|
return self._forward(xs, x_masks, True)
|
||||||
|
|
||||||
|
@ -135,10 +148,12 @@ class DurationPredictorLoss(nn.Layer):
|
||||||
def __init__(self, offset=1.0, reduction="mean"):
|
def __init__(self, offset=1.0, reduction="mean"):
|
||||||
"""Initialize duration predictor loss module.
|
"""Initialize duration predictor loss module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
offset (float, optional): Offset value to avoid nan in log domain.
|
----------
|
||||||
reduction (str): Reduction type in loss calculation.
|
offset : float, optional
|
||||||
|
Offset value to avoid nan in log domain.
|
||||||
|
reduction : str
|
||||||
|
Reduction type in loss calculation.
|
||||||
"""
|
"""
|
||||||
super(DurationPredictorLoss, self).__init__()
|
super(DurationPredictorLoss, self).__init__()
|
||||||
self.criterion = nn.MSELoss(reduction=reduction)
|
self.criterion = nn.MSELoss(reduction=reduction)
|
||||||
|
@ -147,16 +162,21 @@ class DurationPredictorLoss(nn.Layer):
|
||||||
def forward(self, outputs, targets):
|
def forward(self, outputs, targets):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
outputs (Tensor): Batch of prediction durations in log domain (B, T)
|
----------
|
||||||
targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)
|
outputs : Tensor
|
||||||
|
Batch of prediction durations in log domain (B, T)
|
||||||
|
targets : LongTensor
|
||||||
|
Batch of groundtruth durations in linear domain (B, T)
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Mean squared error loss value.
|
----------
|
||||||
|
Tensor
|
||||||
|
Mean squared error loss value.
|
||||||
|
|
||||||
Note:
|
Note
|
||||||
|
----------
|
||||||
`outputs` is in log domain but `targets` is in linear domain.
|
`outputs` is in log domain but `targets` is in linear domain.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# NOTE: outputs is in log domain while targets in linear
|
# NOTE: outputs is in log domain while targets in linear
|
||||||
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
|
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
|
||||||
|
|
|
@ -13,8 +13,6 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Length regulator related modules."""
|
"""Length regulator related modules."""
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
@ -37,8 +35,10 @@ class LengthRegulator(nn.Layer):
|
||||||
def __init__(self, pad_value=0.0):
|
def __init__(self, pad_value=0.0):
|
||||||
"""Initialize length regulator module.
|
"""Initialize length regulator module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
pad_value (float, optional): Value used for padding.
|
----------
|
||||||
|
pad_value : float, optional
|
||||||
|
Value used for padding.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -68,14 +68,19 @@ class LengthRegulator(nn.Layer):
|
||||||
def forward(self, xs, ds, alpha=1.0):
|
def forward(self, xs, ds, alpha=1.0):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
----------
|
||||||
ds (LongTensor): Batch of durations of each frame (B, T).
|
xs : Tensor
|
||||||
alpha (float, optional): Alpha value to control speed of speech.
|
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
|
||||||
|
ds : LongTensor
|
||||||
Returns:
|
Batch of durations of each frame (B, T).
|
||||||
Tensor: replicated input tensor based on durations (B, T*, D).
|
alpha : float, optional
|
||||||
|
Alpha value to control speed of speech.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
Tensor
|
||||||
|
replicated input tensor based on durations (B, T*, D).
|
||||||
"""
|
"""
|
||||||
if alpha != 1.0:
|
if alpha != 1.0:
|
||||||
assert alpha > 0
|
assert alpha > 0
|
||||||
|
|
|
@ -43,15 +43,22 @@ class Postnet(nn.Layer):
|
||||||
use_batch_norm=True, ):
|
use_batch_norm=True, ):
|
||||||
"""Initialize postnet module.
|
"""Initialize postnet module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Dimension of the inputs.
|
----------
|
||||||
odim (int): Dimension of the outputs.
|
idim : int
|
||||||
n_layers (int, optional): The number of layers.
|
Dimension of the inputs.
|
||||||
n_filts (int, optional): The number of filter size.
|
odim : int
|
||||||
n_units (int, optional): The number of filter channels.
|
Dimension of the outputs.
|
||||||
use_batch_norm (bool, optional): Whether to use batch normalization..
|
n_layers : int, optional
|
||||||
dropout_rate (float, optional): Dropout rate..
|
The number of layers.
|
||||||
|
n_filts : int, optional
|
||||||
|
The number of filter size.
|
||||||
|
n_units : int, optional
|
||||||
|
The number of filter channels.
|
||||||
|
use_batch_norm : bool, optional
|
||||||
|
Whether to use batch normalization.
|
||||||
|
dropout_rate : float, optional
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
super(Postnet, self).__init__()
|
super(Postnet, self).__init__()
|
||||||
self.postnet = nn.LayerList()
|
self.postnet = nn.LayerList()
|
||||||
|
@ -111,11 +118,15 @@ class Postnet(nn.Layer):
|
||||||
def forward(self, xs):
|
def forward(self, xs):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
|
----------
|
||||||
|
xs : Tensor
|
||||||
|
Batch of the sequences of padded input tensors (B, idim, Tmax).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Batch of padded output tensor. (B, odim, Tmax).
|
----------
|
||||||
|
Tensor
|
||||||
|
Batch of padded output tensor. (B, odim, Tmax).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for i in six.moves.range(len(self.postnet)):
|
for i in six.moves.range(len(self.postnet)):
|
||||||
|
|
|
@ -15,10 +15,8 @@
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from parakeet.modules.layer_norm import LayerNorm
|
from parakeet.modules.layer_norm import LayerNorm
|
||||||
from parakeet.modules.masked_fill import masked_fill
|
from parakeet.modules.masked_fill import masked_fill
|
||||||
|
|
||||||
from typeguard import check_argument_types
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,13 +41,18 @@ class VariancePredictor(nn.Layer):
|
||||||
dropout_rate: float=0.5, ):
|
dropout_rate: float=0.5, ):
|
||||||
"""Initialize duration predictor module.
|
"""Initialize duration predictor module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
n_layers (int, optional): Number of convolutional layers.
|
idim : int
|
||||||
n_chans (int, optional): Number of channels of convolutional layers.
|
Input dimension.
|
||||||
kernel_size (int, optional): Kernel size of convolutional layers.
|
n_layers : int, optional
|
||||||
dropout_rate (float, optional): Dropout rate.
|
Number of convolutional layers.
|
||||||
|
n_chans : int, optional
|
||||||
|
Number of channels of convolutional layers.
|
||||||
|
kernel_size : int, optional
|
||||||
|
Kernel size of convolutional layers.
|
||||||
|
dropout_rate : float, optional
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
assert check_argument_types()
|
assert check_argument_types()
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
@ -70,26 +73,30 @@ class VariancePredictor(nn.Layer):
|
||||||
n_chans, dim=1),
|
n_chans, dim=1),
|
||||||
nn.Dropout(dropout_rate), ))
|
nn.Dropout(dropout_rate), ))
|
||||||
|
|
||||||
self.linear = nn.Linear(n_chans, 1)
|
self.linear = nn.Linear(n_chans, 1, bias_attr=True)
|
||||||
|
|
||||||
def forward(self, xs: paddle.Tensor,
|
def forward(self, xs: paddle.Tensor,
|
||||||
x_masks: paddle.Tensor=None) -> paddle.Tensor:
|
x_masks: paddle.Tensor=None) -> paddle.Tensor:
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (Tensor): Batch of input sequences (B, Tmax, idim).
|
----------
|
||||||
x_masks (ByteTensor, optional):
|
xs : Tensor
|
||||||
|
Batch of input sequences (B, Tmax, idim).
|
||||||
|
x_masks : Tensor(bool), optional
|
||||||
Batch of masks indicating padded part (B, Tmax, 1).
|
Batch of masks indicating padded part (B, Tmax, 1).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Batch of predicted sequences (B, Tmax, 1).
|
----------
|
||||||
|
Tensor
|
||||||
|
Batch of predicted sequences (B, Tmax, 1).
|
||||||
"""
|
"""
|
||||||
# (B, idim, Tmax)
|
# (B, idim, Tmax)
|
||||||
xs = xs.transpose([0, 2, 1])
|
xs = xs.transpose([0, 2, 1])
|
||||||
# (B, C, Tmax)
|
# (B, C, Tmax)
|
||||||
for f in self.conv:
|
for f in self.conv:
|
||||||
xs = f(xs) # (B, C, Tmax)
|
# (B, C, Tmax)
|
||||||
|
xs = f(xs)
|
||||||
# (B, Tmax, 1)
|
# (B, Tmax, 1)
|
||||||
xs = self.linear(xs.transpose([0, 2, 1]))
|
xs = self.linear(xs.transpose([0, 2, 1]))
|
||||||
|
|
||||||
|
|
|
@ -16,23 +16,22 @@
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
from paddle.fluid.layers import sequence_mask
|
|
||||||
|
|
||||||
from parakeet.modules.masked_fill import masked_fill
|
from parakeet.modules.masked_fill import masked_fill
|
||||||
|
|
||||||
|
|
||||||
class MultiHeadedAttention(nn.Layer):
|
class MultiHeadedAttention(nn.Layer):
|
||||||
"""Multi-Head Attention layer.
|
"""Multi-Head Attention layer.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
n_head (int): The number of heads.
|
----------
|
||||||
n_feat (int): The number of features.
|
n_head : int
|
||||||
dropout_rate (float): Dropout rate.
|
The number of heads.
|
||||||
|
n_feat : int
|
||||||
|
The number of features.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, n_head, n_feat, dropout_rate):
|
def __init__(self, n_head, n_feat, dropout_rate):
|
||||||
|
@ -42,33 +41,42 @@ class MultiHeadedAttention(nn.Layer):
|
||||||
# We assume d_v always equals d_k
|
# We assume d_v always equals d_k
|
||||||
self.d_k = n_feat // n_head
|
self.d_k = n_feat // n_head
|
||||||
self.h = n_head
|
self.h = n_head
|
||||||
self.linear_q = nn.Linear(n_feat, n_feat)
|
self.linear_q = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.linear_k = nn.Linear(n_feat, n_feat)
|
self.linear_k = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.linear_v = nn.Linear(n_feat, n_feat)
|
self.linear_v = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.linear_out = nn.Linear(n_feat, n_feat)
|
self.linear_out = nn.Linear(n_feat, n_feat, bias_attr=True)
|
||||||
self.attn = None
|
self.attn = None
|
||||||
self.dropout = nn.Dropout(p=dropout_rate)
|
self.dropout = nn.Dropout(p=dropout_rate)
|
||||||
|
|
||||||
def forward_qkv(self, query, key, value):
|
def forward_qkv(self, query, key, value):
|
||||||
"""Transform query, key and value.
|
"""Transform query, key and value.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
----------
|
||||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
query : paddle.Tensor
|
||||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
Query tensor (#batch, time1, size).
|
||||||
|
key : paddle.Tensor
|
||||||
Returns:
|
Key tensor (#batch, time2, size).
|
||||||
paddle.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
|
value : paddle.Tensor
|
||||||
paddle.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
|
Value tensor (#batch, time2, size).
|
||||||
paddle.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed query tensor (#batch, n_head, time1, d_k).
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed key tensor (#batch, n_head, time2, d_k).
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed value tensor (#batch, n_head, time2, d_k).
|
||||||
"""
|
"""
|
||||||
n_batch = query.shape[0]
|
n_batch = query.shape[0]
|
||||||
|
|
||||||
q = paddle.reshape(
|
q = paddle.reshape(
|
||||||
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
|
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
|
||||||
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
|
k = paddle.reshape(self.linear_k(key), [n_batch, -1, self.h, self.d_k])
|
||||||
v = paddle.reshape(
|
v = paddle.reshape(
|
||||||
self.linear_v(value), [n_batch, -1, self.h, self.d_k])
|
self.linear_v(value), [n_batch, -1, self.h, self.d_k])
|
||||||
|
|
||||||
# (batch, head, time1, d_k)
|
# (batch, head, time1, d_k)
|
||||||
q = q.transpose((0, 2, 1, 3))
|
q = q.transpose((0, 2, 1, 3))
|
||||||
# (batch, head, time2, d_k)
|
# (batch, head, time2, d_k)
|
||||||
|
@ -80,43 +88,39 @@ class MultiHeadedAttention(nn.Layer):
|
||||||
def forward_attention(self, value, scores, mask=None):
|
def forward_attention(self, value, scores, mask=None):
|
||||||
"""Compute attention context vector.
|
"""Compute attention context vector.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
value (paddle.Tensor): Transformed value (#batch, n_head, time2, d_k).
|
----------
|
||||||
scores (paddle.Tensor): Attention score (#batch, n_head, time1, time2).
|
value : paddle.Tensor
|
||||||
mask (paddle.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
|
Transformed value (#batch, n_head, time2, d_k).
|
||||||
|
scores : paddle.Tensor
|
||||||
|
Attention score (#batch, n_head, time1, time2).
|
||||||
|
mask : paddle.Tensor
|
||||||
|
Mask (#batch, 1, time2) or (#batch, time1, time2).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
paddle.Tensor: Transformed value (#batch, time1, d_model)
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Transformed value (#batch, time1, d_model)
|
||||||
weighted by the attention score (#batch, time1, time2).
|
weighted by the attention score (#batch, time1, time2).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
n_batch = value.shape[0]
|
n_batch = value.shape[0]
|
||||||
softmax = paddle.nn.Softmax(axis=-1)
|
softmax = paddle.nn.Softmax(axis=-1)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
|
|
||||||
mask = mask.unsqueeze(1)
|
mask = mask.unsqueeze(1)
|
||||||
# mask 取反, pad 的位置变成 true,之后 pad 的位置被替换为 0
|
|
||||||
mask = paddle.logical_not(mask)
|
mask = paddle.logical_not(mask)
|
||||||
|
|
||||||
# mask = paddle.cast(mask, dtype='int64')
|
|
||||||
# mask ==1 的位置用 min_value 代替
|
|
||||||
# scores = scores.masked_fill(mask, min_value)
|
|
||||||
min_value = float(
|
min_value = float(
|
||||||
numpy.finfo(
|
numpy.finfo(
|
||||||
paddle.to_tensor(
|
paddle.to_tensor(
|
||||||
0, dtype=scores.dtype).numpy().dtype).min)
|
0, dtype=scores.dtype).numpy().dtype).min)
|
||||||
|
|
||||||
scores = masked_fill(scores, mask, min_value)
|
scores = masked_fill(scores, mask, min_value)
|
||||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
# (batch, head, time1, time2)
|
||||||
|
self.attn = softmax(scores)
|
||||||
# 用value填充tensor中与mask中值为1位置相对应的元素 == 保留 mask 为0 的值
|
|
||||||
# self.attn = torch.softmax(scores, dim=-1).masked_fill(
|
|
||||||
# mask, 0.0
|
|
||||||
# ) # (batch, head, time1, time2)
|
|
||||||
# 保留 mask 为 0 的位置,其他变成 0
|
|
||||||
self.attn = masked_fill(self.attn, mask, 0.0)
|
self.attn = masked_fill(self.attn, mask, 0.0)
|
||||||
else:
|
else:
|
||||||
self.attn = softmax(scores) # (batch, head, time1, time2)
|
# (batch, head, time1, time2)
|
||||||
|
self.attn = softmax(scores)
|
||||||
# (batch, head, time1, time2)
|
# (batch, head, time1, time2)
|
||||||
p_attn = self.dropout(self.attn)
|
p_attn = self.dropout(self.attn)
|
||||||
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
|
# (batch, head, time1, time2) * (batch, head, time2, d_k) -> # (batch, head, time1, d_k)
|
||||||
|
@ -130,16 +134,21 @@ class MultiHeadedAttention(nn.Layer):
|
||||||
def forward(self, query, key, value, mask=None):
|
def forward(self, query, key, value, mask=None):
|
||||||
"""Compute scaled dot product attention.
|
"""Compute scaled dot product attention.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
query (paddle.Tensor): Query tensor (#batch, time1, size).
|
----------
|
||||||
key (paddle.Tensor): Key tensor (#batch, time2, size).
|
query : paddle.Tensor
|
||||||
value (paddle.Tensor): Value tensor (#batch, time2, size).
|
Query tensor (#batch, time1, size).
|
||||||
mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
|
key : paddle.Tensor
|
||||||
(#batch, time1, time2).
|
Key tensor (#batch, time2, size).
|
||||||
|
value : paddle.Tensor
|
||||||
Returns:
|
Value tensor (#batch, time2, size).
|
||||||
paddle.Tensor: Output tensor (#batch, time1, d_model).
|
mask : paddle.Tensor
|
||||||
|
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor (#batch, time1, d_model).
|
||||||
"""
|
"""
|
||||||
q, k, v = self.forward_qkv(query, key, value)
|
q, k, v = self.forward_qkv(query, key, value)
|
||||||
scores = paddle.matmul(q, k.transpose(
|
scores = paddle.matmul(q, k.transpose(
|
||||||
|
|
|
@ -22,14 +22,16 @@ from paddle import nn
|
||||||
class PositionalEncoding(nn.Layer):
|
class PositionalEncoding(nn.Layer):
|
||||||
"""Positional encoding.
|
"""Positional encoding.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
d_model (int): Embedding dimension.
|
----------
|
||||||
dropout_rate (float): Dropout rate.
|
d_model : int
|
||||||
max_len (int): Maximum input length.
|
Embedding dimension.
|
||||||
reverse (bool): Whether to reverse the input position. Only for
|
dropout_rate : float
|
||||||
the class LegacyRelPositionalEncoding. We remove it in the current
|
Dropout rate.
|
||||||
class RelPositionalEncoding.
|
max_len : int
|
||||||
|
Maximum input length.
|
||||||
|
reverse : bool
|
||||||
|
Whether to reverse the input position.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
|
||||||
|
@ -47,7 +49,6 @@ class PositionalEncoding(nn.Layer):
|
||||||
|
|
||||||
pe = paddle.zeros([x.shape[1], self.d_model])
|
pe = paddle.zeros([x.shape[1], self.d_model])
|
||||||
if self.reverse:
|
if self.reverse:
|
||||||
# (x.shape[1],1)
|
|
||||||
position = paddle.arange(
|
position = paddle.arange(
|
||||||
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
|
x.shape[1] - 1, -1, -1.0, dtype=paddle.float32).unsqueeze(1)
|
||||||
else:
|
else:
|
||||||
|
@ -65,12 +66,15 @@ class PositionalEncoding(nn.Layer):
|
||||||
def forward(self, x: paddle.Tensor):
|
def forward(self, x: paddle.Tensor):
|
||||||
"""Add positional encoding.
|
"""Add positional encoding.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Input tensor (batch, time, `*`).
|
||||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Encoded tensor (batch, time, `*`).
|
||||||
"""
|
"""
|
||||||
self.extend_pe(x)
|
self.extend_pe(x)
|
||||||
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
x = x * self.xscale + self.pe[:, :x.shape[1]]
|
||||||
|
@ -82,11 +86,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
||||||
|
|
||||||
See Sec. 3.2 https://arxiv.org/abs/1809.08895
|
See Sec. 3.2 https://arxiv.org/abs/1809.08895
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
d_model (int): Embedding dimension.
|
----------
|
||||||
dropout_rate (float): Dropout rate.
|
d_model : int
|
||||||
max_len (int): Maximum input length.
|
Embedding dimension.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
max_len : int
|
||||||
|
Maximum input length.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, d_model, dropout_rate, max_len=5000):
|
def __init__(self, d_model, dropout_rate, max_len=5000):
|
||||||
|
@ -106,12 +113,15 @@ class ScaledPositionalEncoding(PositionalEncoding):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Add positional encoding.
|
"""Add positional encoding.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Input tensor (batch, time, `*`).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Input tensor (batch, time, `*`).
|
||||||
paddle.Tensor: Encoded tensor (batch, time, `*`).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Encoded tensor (batch, time, `*`).
|
||||||
"""
|
"""
|
||||||
self.extend_pe(x)
|
self.extend_pe(x)
|
||||||
x = x + self.alpha * self.pe[:, :x.shape[1]]
|
x = x + self.alpha * self.pe[:, :x.shape[1]]
|
||||||
|
|
|
@ -12,19 +12,11 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import math
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
import logging
|
import logging
|
||||||
import paddle
|
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
from paddle.nn import functional as F
|
|
||||||
from paddle.nn import initializer as I
|
|
||||||
from paddle.fluid.layers import sequence_mask
|
|
||||||
import sys
|
|
||||||
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
|
||||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
||||||
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
|
|
||||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
|
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
|
||||||
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
|
from parakeet.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
|
||||||
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
|
from parakeet.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
|
||||||
|
@ -35,28 +27,44 @@ from parakeet.modules.fastspeech2_transformer.repeat import repeat
|
||||||
class Encoder(nn.Layer):
|
class Encoder(nn.Layer):
|
||||||
"""Transformer encoder module.
|
"""Transformer encoder module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
attention_dim (int): Dimension of attention.
|
idim : int
|
||||||
attention_heads (int): The number of heads of multi head attention.
|
Input dimension.
|
||||||
linear_units (int): The number of units of position-wise feed forward.
|
attention_dim : int
|
||||||
num_blocks (int): The number of encoder blocks.
|
Dimension of attention.
|
||||||
dropout_rate (float): Dropout rate.
|
attention_heads : int
|
||||||
positional_dropout_rate (float): Dropout rate after adding positional encoding.
|
The number of heads of multi head attention.
|
||||||
attention_dropout_rate (float): Dropout rate in attention.
|
linear_units : int
|
||||||
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
|
The number of units of position-wise feed forward.
|
||||||
pos_enc_class (paddle.nn.Layer): Positional encoding module class.
|
num_blocks : int
|
||||||
|
The number of encoder blocks.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
positional_dropout_rate : float
|
||||||
|
Dropout rate after adding positional encoding.
|
||||||
|
attention_dropout_rate : float
|
||||||
|
Dropout rate in attention.
|
||||||
|
input_layer : Union[str, paddle.nn.Layer]
|
||||||
|
Input layer type.
|
||||||
|
pos_enc_class : paddle.nn.Layer
|
||||||
|
Positional encoding module class.
|
||||||
`PositionalEncoding` or `ScaledPositionalEncoding`
|
`PositionalEncoding` or `ScaledPositionalEncoding`
|
||||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
normalize_before : bool
|
||||||
concat_after (bool): Whether to concat attention layer's input and output.
|
Whether to use layer_norm before the first block.
|
||||||
|
concat_after : bool
|
||||||
|
Whether to concat attention layer's input and output.
|
||||||
if True, additional linear will be applied.
|
if True, additional linear will be applied.
|
||||||
i.e. x -> x + linear(concat(x, att(x)))
|
i.e. x -> x + linear(concat(x, att(x)))
|
||||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||||
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
|
positionwise_layer_type : str
|
||||||
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
|
"linear", "conv1d", or "conv1d-linear".
|
||||||
selfattention_layer_type (str): Encoder attention layer type.
|
positionwise_conv_kernel_size : int
|
||||||
padding_idx (int): Padding idx for input_layer=embed.
|
Kernel size of positionwise conv1d layer.
|
||||||
|
selfattention_layer_type : str
|
||||||
|
Encoder attention layer type.
|
||||||
|
padding_idx : int
|
||||||
|
Padding idx for input_layer=embed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -82,7 +90,8 @@ class Encoder(nn.Layer):
|
||||||
self.conv_subsampling_factor = 1
|
self.conv_subsampling_factor = 1
|
||||||
if input_layer == "linear":
|
if input_layer == "linear":
|
||||||
self.embed = nn.Sequential(
|
self.embed = nn.Sequential(
|
||||||
nn.Linear(idim, attention_dim),
|
nn.Linear(
|
||||||
|
idim, attention_dim, bias_attr=True),
|
||||||
nn.LayerNorm(attention_dim),
|
nn.LayerNorm(attention_dim),
|
||||||
nn.Dropout(dropout_rate),
|
nn.Dropout(dropout_rate),
|
||||||
nn.ReLU(),
|
nn.ReLU(),
|
||||||
|
@ -169,14 +178,19 @@ class Encoder(nn.Layer):
|
||||||
def forward(self, xs, masks):
|
def forward(self, xs, masks):
|
||||||
"""Encode input sequence.
|
"""Encode input sequence.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (paddle.Tensor): Input tensor (#batch, time, idim).
|
----------
|
||||||
masks (paddle.Tensor): Mask tensor (#batch, time).
|
xs : paddle.Tensor
|
||||||
|
Input tensor (#batch, time, idim).
|
||||||
Returns:
|
masks : paddle.Tensor
|
||||||
paddle.Tensor: Output tensor (#batch, time, attention_dim).
|
Mask tensor (#batch, time).
|
||||||
paddle.Tensor: Mask tensor (#batch, time).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor (#batch, time, attention_dim).
|
||||||
|
paddle.Tensor
|
||||||
|
Mask tensor (#batch, time).
|
||||||
"""
|
"""
|
||||||
xs = self.embed(xs)
|
xs = self.embed(xs)
|
||||||
xs, masks = self.encoders(xs, masks)
|
xs, masks = self.encoders(xs, masks)
|
||||||
|
@ -187,16 +201,23 @@ class Encoder(nn.Layer):
|
||||||
def forward_one_step(self, xs, masks, cache=None):
|
def forward_one_step(self, xs, masks, cache=None):
|
||||||
"""Encode input frame.
|
"""Encode input frame.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (paddle.Tensor): Input tensor.
|
----------
|
||||||
masks (paddle.Tensor): Mask tensor.
|
xs : paddle.Tensor
|
||||||
cache (List[paddle.Tensor]): List of cache tensors.
|
Input tensor.
|
||||||
|
masks : paddle.Tensor
|
||||||
Returns:
|
Mask tensor.
|
||||||
paddle.Tensor: Output tensor.
|
cache : List[paddle.Tensor]
|
||||||
paddle.Tensor: Mask tensor.
|
List of cache tensors.
|
||||||
List[paddle.Tensor]: List of new cache tensors.
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor.
|
||||||
|
paddle.Tensor
|
||||||
|
Mask tensor.
|
||||||
|
List[paddle.Tensor]
|
||||||
|
List of new cache tensors.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
xs = self.embed(xs)
|
xs = self.embed(xs)
|
||||||
|
|
|
@ -14,28 +14,31 @@
|
||||||
"""Encoder self-attention layer definition."""
|
"""Encoder self-attention layer definition."""
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
from paddle import nn
|
from paddle import nn
|
||||||
|
|
||||||
|
|
||||||
class EncoderLayer(nn.Layer):
|
class EncoderLayer(nn.Layer):
|
||||||
"""Encoder layer module.
|
"""Encoder layer module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
size (int): Input dimension.
|
----------
|
||||||
self_attn (paddle.nn.Layer): Self-attention module instance.
|
size : int
|
||||||
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
|
Input dimension.
|
||||||
can be used as the argument.
|
self_attn : paddle.nn.Layer
|
||||||
feed_forward (paddle.nn.Layer): Feed-forward module instance.
|
Self-attention module instance.
|
||||||
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
`MultiHeadedAttention` instance can be used as the argument.
|
||||||
can be used as the argument.
|
feed_forward : paddle.nn.Layer
|
||||||
dropout_rate (float): Dropout rate.
|
Feed-forward module instance.
|
||||||
normalize_before (bool): Whether to use layer_norm before the first block.
|
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
|
||||||
concat_after (bool): Whether to concat attention layer's input and output.
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
normalize_before : bool
|
||||||
|
Whether to use layer_norm before the first block.
|
||||||
|
concat_after : bool
|
||||||
|
Whether to concat attention layer's input and output.
|
||||||
if True, additional linear will be applied.
|
if True, additional linear will be applied.
|
||||||
i.e. x -> x + linear(concat(x, att(x)))
|
i.e. x -> x + linear(concat(x, att(x)))
|
||||||
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -57,20 +60,26 @@ class EncoderLayer(nn.Layer):
|
||||||
self.normalize_before = normalize_before
|
self.normalize_before = normalize_before
|
||||||
self.concat_after = concat_after
|
self.concat_after = concat_after
|
||||||
if self.concat_after:
|
if self.concat_after:
|
||||||
self.concat_linear = nn.Linear(size + size, size)
|
self.concat_linear = nn.Linear(size + size, size, bias_attr=True)
|
||||||
|
|
||||||
def forward(self, x, mask, cache=None):
|
def forward(self, x, mask, cache=None):
|
||||||
"""Compute encoded features.
|
"""Compute encoded features.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Input tensor (#batch, time, size).
|
----------
|
||||||
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
|
x : paddle.Tensor
|
||||||
cache (paddle.Tensor): Cache tensor of the input (#batch, time - 1, size).
|
Input tensor (#batch, time, size).
|
||||||
|
mask : paddle.Tensor
|
||||||
Returns:
|
Mask tensor for the input (#batch, time).
|
||||||
paddle.Tensor: Output tensor (#batch, time, size).
|
cache : paddle.Tensor
|
||||||
paddle.Tensor: Mask tensor (#batch, time).
|
Cache tensor of the input (#batch, time - 1, size).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Output tensor (#batch, time, size).
|
||||||
|
paddle.Tensor
|
||||||
|
Mask tensor (#batch, time).
|
||||||
"""
|
"""
|
||||||
residual = x
|
residual = x
|
||||||
if self.normalize_before:
|
if self.normalize_before:
|
||||||
|
@ -82,7 +91,6 @@ class EncoderLayer(nn.Layer):
|
||||||
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
|
assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
|
||||||
x_q = x[:, -1:, :]
|
x_q = x[:, -1:, :]
|
||||||
residual = residual[:, -1:, :]
|
residual = residual[:, -1:, :]
|
||||||
# non-pad mask 变成 pad mask
|
|
||||||
mask = None if mask is None else mask[:, -1:, :]
|
mask = None if mask is None else mask[:, -1:, :]
|
||||||
|
|
||||||
if self.concat_after:
|
if self.concat_after:
|
||||||
|
@ -90,6 +98,7 @@ class EncoderLayer(nn.Layer):
|
||||||
(x, self.self_attn(x_q, x, x, mask)), axis=-1)
|
(x, self.self_attn(x_q, x, x, mask)), axis=-1)
|
||||||
x = residual + self.concat_linear(x_concat)
|
x = residual + self.concat_linear(x_concat)
|
||||||
else:
|
else:
|
||||||
|
|
||||||
x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
|
x = residual + self.dropout(self.self_attn(x_q, x, x, mask))
|
||||||
if not self.normalize_before:
|
if not self.normalize_before:
|
||||||
x = self.norm1(x)
|
x = self.norm1(x)
|
||||||
|
|
|
@ -32,11 +32,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
||||||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||||
"""Initialize MultiLayeredConv1d module.
|
"""Initialize MultiLayeredConv1d module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
in_chans (int): Number of input channels.
|
----------
|
||||||
hidden_chans (int): Number of hidden channels.
|
in_chans : int
|
||||||
kernel_size (int): Kernel size of conv1d.
|
Number of input channels.
|
||||||
dropout_rate (float): Dropout rate.
|
hidden_chans : int
|
||||||
|
Number of hidden channels.
|
||||||
|
kernel_size : int
|
||||||
|
Kernel size of conv1d.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super(MultiLayeredConv1d, self).__init__()
|
super(MultiLayeredConv1d, self).__init__()
|
||||||
|
@ -58,14 +63,16 @@ class MultiLayeredConv1d(paddle.nn.Layer):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Batch of input tensors (B, T, in_chans).
|
||||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Batch of output tensors (B, T, in_chans).
|
||||||
"""
|
"""
|
||||||
# x = paddle.nn.ReLU(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
|
|
||||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||||
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
|
||||||
[0, 2, 1])
|
[0, 2, 1])
|
||||||
|
@ -81,12 +88,16 @@ class Conv1dLinear(paddle.nn.Layer):
|
||||||
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
|
||||||
"""Initialize Conv1dLinear module.
|
"""Initialize Conv1dLinear module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
in_chans (int): Number of input channels.
|
----------
|
||||||
hidden_chans (int): Number of hidden channels.
|
in_chans : int
|
||||||
kernel_size (int): Kernel size of conv1d.
|
Number of input channels.
|
||||||
dropout_rate (float): Dropout rate.
|
hidden_chans : int
|
||||||
|
Number of hidden channels.
|
||||||
|
kernel_size : int
|
||||||
|
Kernel size of conv1d.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
super(Conv1dLinear, self).__init__()
|
super(Conv1dLinear, self).__init__()
|
||||||
self.w_1 = paddle.nn.Conv1D(
|
self.w_1 = paddle.nn.Conv1D(
|
||||||
|
@ -95,18 +106,22 @@ class Conv1dLinear(paddle.nn.Layer):
|
||||||
kernel_size,
|
kernel_size,
|
||||||
stride=1,
|
stride=1,
|
||||||
padding=(kernel_size - 1) // 2, )
|
padding=(kernel_size - 1) // 2, )
|
||||||
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans)
|
self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True)
|
||||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||||
self.relu = paddle.nn.ReLU()
|
self.relu = paddle.nn.ReLU()
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Calculate forward propagation.
|
"""Calculate forward propagation.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (paddle.Tensor): Batch of input tensors (B, T, in_chans).
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
|
Batch of input tensors (B, T, in_chans).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
paddle.Tensor: Batch of output tensors (B, T, in_chans).
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Batch of output tensors (B, T, in_chans).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
|
||||||
|
|
|
@ -19,11 +19,14 @@ import paddle
|
||||||
class PositionwiseFeedForward(paddle.nn.Layer):
|
class PositionwiseFeedForward(paddle.nn.Layer):
|
||||||
"""Positionwise feed forward layer.
|
"""Positionwise feed forward layer.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
idim (int): Input dimension.
|
----------
|
||||||
hidden_units (int): The number of hidden units.
|
idim : int
|
||||||
dropout_rate (float): Dropout rate.
|
Input dimension.
|
||||||
|
hidden_units : int
|
||||||
|
The number of hidden units.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
@ -33,8 +36,8 @@ class PositionwiseFeedForward(paddle.nn.Layer):
|
||||||
activation=paddle.nn.ReLU()):
|
activation=paddle.nn.ReLU()):
|
||||||
"""Construct a PositionwiseFeedForward object."""
|
"""Construct a PositionwiseFeedForward object."""
|
||||||
super(PositionwiseFeedForward, self).__init__()
|
super(PositionwiseFeedForward, self).__init__()
|
||||||
self.w_1 = paddle.nn.Linear(idim, hidden_units)
|
self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True)
|
||||||
self.w_2 = paddle.nn.Linear(hidden_units, idim)
|
self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True)
|
||||||
self.dropout = paddle.nn.Dropout(dropout_rate)
|
self.dropout = paddle.nn.Dropout(dropout_rate)
|
||||||
self.activation = activation
|
self.activation = activation
|
||||||
|
|
||||||
|
|
|
@ -29,12 +29,16 @@ class MultiSequential(paddle.nn.Sequential):
|
||||||
def repeat(N, fn):
|
def repeat(N, fn):
|
||||||
"""Repeat module N times.
|
"""Repeat module N times.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
N (int): Number of repeat time.
|
----------
|
||||||
fn (Callable): Function to generate module.
|
N : int
|
||||||
|
Number of times to repeat.
|
||||||
Returns:
|
fn : Callable
|
||||||
MultiSequential: Repeated model instance.
|
Function to generate module.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
MultiSequential
|
||||||
|
Repeated model instance.
|
||||||
"""
|
"""
|
||||||
return MultiSequential(* [fn(n) for n in range(N)])
|
return MultiSequential(* [fn(n) for n in range(N)])
|
||||||
|
|
|
@ -19,10 +19,12 @@ import paddle
|
||||||
class LayerNorm(paddle.nn.LayerNorm):
|
class LayerNorm(paddle.nn.LayerNorm):
|
||||||
"""Layer normalization module.
|
"""Layer normalization module.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
nout (int): Output dim size.
|
----------
|
||||||
dim (int): Dimension to be normalized.
|
nout : int
|
||||||
|
Output dim size.
|
||||||
|
dim : int
|
||||||
|
Dimension to be normalized.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, nout, dim=-1):
|
def __init__(self, nout, dim=-1):
|
||||||
|
@ -33,12 +35,15 @@ class LayerNorm(paddle.nn.LayerNorm):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
"""Apply layer normalization.
|
"""Apply layer normalization.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
x (torch.Tensor): Input tensor.
|
----------
|
||||||
|
x : paddle.Tensor
|
||||||
Returns:
|
Input tensor.
|
||||||
torch.Tensor: Normalized tensor.
|
|
||||||
|
|
||||||
|
Returns
|
||||||
|
----------
|
||||||
|
paddle.Tensor
|
||||||
|
Normalized tensor.
|
||||||
"""
|
"""
|
||||||
if self.dim == -1:
|
if self.dim == -1:
|
||||||
return super(LayerNorm, self).forward(x)
|
return super(LayerNorm, self).forward(x)
|
||||||
|
|
|
@ -28,7 +28,7 @@ def is_broadcastable(shp1, shp2):
|
||||||
def masked_fill(xs: paddle.Tensor,
|
def masked_fill(xs: paddle.Tensor,
|
||||||
mask: paddle.Tensor,
|
mask: paddle.Tensor,
|
||||||
value: Union[float, int]):
|
value: Union[float, int]):
|
||||||
# assert is_broadcastable(xs.shape, mask.shape) is True
|
assert is_broadcastable(xs.shape, mask.shape) is True
|
||||||
bshape = paddle.broadcast_shape(xs.shape, mask.shape)
|
bshape = paddle.broadcast_shape(xs.shape, mask.shape)
|
||||||
mask = mask.broadcast_to(bshape)
|
mask = mask.broadcast_to(bshape)
|
||||||
trues = paddle.ones_like(xs) * value
|
trues = paddle.ones_like(xs) * value
|
||||||
|
|
|
@ -13,20 +13,27 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
|
||||||
# Zero-pad every item to the length of the longest one in this batch
|
|
||||||
def pad_list(xs, pad_value):
|
def pad_list(xs, pad_value):
|
||||||
"""Perform padding for the list of tensors.
|
"""Perform padding for the list of tensors.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
----------
|
||||||
pad_value (float): Value for padding.
|
xs : List[Tensor]
|
||||||
|
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
|
||||||
|
pad_value : float
|
||||||
|
Value for padding.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Padded tensor (B, Tmax, `*`).
|
----------
|
||||||
|
Tensor
|
||||||
|
Padded tensor (B, Tmax, `*`).
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
----------
|
||||||
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
>>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
|
||||||
>>> x
|
>>> x
|
||||||
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
[tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
|
||||||
|
@ -34,11 +41,9 @@ def pad_list(xs, pad_value):
|
||||||
tensor([[1., 1., 1., 1.],
|
tensor([[1., 1., 1., 1.],
|
||||||
[1., 1., 0., 0.],
|
[1., 1., 0., 0.],
|
||||||
[1., 0., 0., 0.]])
|
[1., 0., 0., 0.]])
|
||||||
|
|
||||||
"""
|
"""
|
||||||
n_batch = len(xs)
|
n_batch = len(xs)
|
||||||
max_len = max(x.shape[0] for x in xs)
|
max_len = max(x.shape[0] for x in xs)
|
||||||
# pad = xs[0].new(n_batch, max_len, *xs[0].shape[1:]).fill_(pad_value)
|
|
||||||
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
|
pad = paddle.full([n_batch, max_len, *xs[0].shape[1:]], pad_value)
|
||||||
|
|
||||||
for i in range(n_batch):
|
for i in range(n_batch):
|
||||||
|
@ -50,13 +55,18 @@ def pad_list(xs, pad_value):
|
||||||
def make_pad_mask(lengths, length_dim=-1):
|
def make_pad_mask(lengths, length_dim=-1):
|
||||||
"""Make mask tensor containing indices of padded part.
|
"""Make mask tensor containing indices of padded part.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
lengths (LongTensor or List): Batch of lengths (B,).
|
----------
|
||||||
|
lengths : LongTensor or List
|
||||||
|
Batch of lengths (B,).
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
Tensor: Mask tensor containing indices of padded part bool.
|
----------
|
||||||
|
Tensor(bool)
|
||||||
|
Mask tensor containing indices of padded part.
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
----------
|
||||||
With only lengths.
|
With only lengths.
|
||||||
|
|
||||||
>>> lengths = [5, 3, 2]
|
>>> lengths = [5, 3, 2]
|
||||||
|
@ -64,7 +74,6 @@ def make_pad_mask(lengths, length_dim=-1):
|
||||||
masks = [[0, 0, 0, 0 ,0],
|
masks = [[0, 0, 0, 0 ,0],
|
||||||
[0, 0, 0, 1, 1],
|
[0, 0, 0, 1, 1],
|
||||||
[0, 0, 1, 1, 1]]
|
[0, 0, 1, 1, 1]]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if length_dim == 0:
|
if length_dim == 0:
|
||||||
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
|
||||||
|
@ -88,17 +97,24 @@ def make_pad_mask(lengths, length_dim=-1):
|
||||||
def make_non_pad_mask(lengths, length_dim=-1):
|
def make_non_pad_mask(lengths, length_dim=-1):
|
||||||
"""Make mask tensor containing indices of non-padded part.
|
"""Make mask tensor containing indices of non-padded part.
|
||||||
|
|
||||||
Args:
|
Parameters
|
||||||
lengths (LongTensor or List): Batch of lengths (B,).
|
----------
|
||||||
xs (Tensor, optional): The reference tensor.
|
lengths : LongTensor or List
|
||||||
|
Batch of lengths (B,).
|
||||||
|
xs : Tensor, optional
|
||||||
|
The reference tensor.
|
||||||
If set, masks will be the same shape as this tensor.
|
If set, masks will be the same shape as this tensor.
|
||||||
length_dim (int, optional): Dimension indicator of the above tensor.
|
length_dim : int, optional
|
||||||
|
Dimension indicator of the above tensor.
|
||||||
See the example.
|
See the example.
|
||||||
|
|
||||||
Returns:
|
Returns
|
||||||
ByteTensor: mask tensor containing indices of padded part bool.
|
----------
|
||||||
|
Tensor(bool)
|
||||||
|
Mask tensor containing indices of non-padded part.
|
||||||
|
|
||||||
Examples:
|
Examples
|
||||||
|
----------
|
||||||
With only lengths.
|
With only lengths.
|
||||||
|
|
||||||
>>> lengths = [5, 3, 2]
|
>>> lengths = [5, 3, 2]
|
||||||
|
@ -106,6 +122,37 @@ def make_non_pad_mask(lengths, length_dim=-1):
|
||||||
masks = [[1, 1, 1, 1 ,1],
|
masks = [[1, 1, 1, 1 ,1],
|
||||||
[1, 1, 1, 0, 0],
|
[1, 1, 1, 0, 0],
|
||||||
[1, 1, 0, 0, 0]]
|
[1, 1, 0, 0, 0]]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
return paddle.logical_not(make_pad_mask(lengths, length_dim))
|
||||||
|
|
||||||
|
|
||||||
|
def initialize(model: nn.Layer, init: str):
|
||||||
|
"""Initialize weights of a neural network module.
|
||||||
|
|
||||||
|
Parameters are initialized using the given method or distribution.
|
||||||
|
|
||||||
|
Custom initialization routines can be implemented into submodules
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
model : paddle.nn.Layer
|
||||||
|
Target.
|
||||||
|
init : str
|
||||||
|
Method of initialization.
|
||||||
|
"""
|
||||||
|
assert check_argument_types()
|
||||||
|
|
||||||
|
if init == "xavier_uniform":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
elif init == "xavier_normal":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.XavierNormal(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
elif init == "kaiming_uniform":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
elif init == "kaiming_normal":
|
||||||
|
nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(),
|
||||||
|
nn.initializer.Constant())
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown initialization: " + init)
|
||||||
|
|
Loading…
Reference in New Issue