Merge pull request #60 from lfchener/doc

add docstring for LocationSensitiveAttention
Feiyu Chan 2020-12-18 20:38:09 +08:00 committed by GitHub
commit d08eb72791
6 changed files with 413 additions and 154 deletions

View File

@@ -19,6 +19,8 @@ from parakeet.frontend.normalizer.numbers import normalize_numbers
def normalize(sentence):
""" Normalize English text.
"""
# preprocessing
sentence = unicode(sentence)
sentence = normalize_numbers(sentence)

View File

@@ -75,6 +75,8 @@ def _expand_number(m):
def normalize_numbers(text):
""" Normalize numbers in English text.
"""
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)

View File

@@ -39,6 +39,9 @@ class Phonetics(ABC):
class English(Phonetics):
""" Normalize the input text sequence and convert it into a pronunciation id sequence.
"""
def __init__(self):
self.backend = G2p()
self.phonemes = list(self.backend.phonemes)
@@ -46,6 +49,18 @@ class English(Phonetics):
self.vocab = Vocab(self.phonemes + self.punctuations)
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
start = self.vocab.start_symbol start = self.vocab.start_symbol
end = self.vocab.end_symbol end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \ phonemes = ([] if start is None else [start]) \
@ -54,6 +69,18 @@ class English(Phonetics):
return phonemes return phonemes
def numericalize(self, phonemes):
""" Convert a pronunciation sequence into a pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The pronunciation sequence.
Returns
----------
List[int]
The pronunciation id sequence.
"""
ids = [
self.vocab.lookup(item) for item in phonemes
if item in self.vocab.stoi
@@ -61,17 +88,46 @@ class English(Phonetics):
return ids
def reverse(self, ids):
""" Reverse a pronunciation id sequence to a pronunciation sequence.
Parameters
-----------
ids: List[int]
The pronunciation id sequence.
Returns
----------
List[str]
The pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Convert the input text sequence into a pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
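
To make the documented English frontend concrete, here is a minimal usage sketch. It assumes parakeet and its g2p_en backend are installed and that English is importable from parakeet.frontend; the import path and the example sentence are assumptions, not part of this commit.

from parakeet.frontend import English  # import path is an assumption

frontend = English()
# Normalize the text and convert it into a phoneme (pronunciation) sequence.
phonemes = frontend.phoneticize("Hello, world!")
# Map the phonemes to ids with the built-in vocab.
ids = frontend.numericalize(phonemes)
# Map ids back to phoneme symbols.
symbols = frontend.reverse(ids)
# __call__ chains phoneticize and numericalize.
same_ids = frontend("Hello, world!")
print(len(ids), frontend.vocab_size)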
class EnglishCharacter(Phonetics):
""" Normalize the input text sequence and convert it into character id sequence.
"""
def __init__(self):
self.backend = G2p()
self.graphemes = list(self.backend.graphemes)
@@ -79,10 +135,34 @@ class EnglishCharacter(Phonetics):
self.vocab = Vocab(self.graphemes + self.punctuations)
def phoneticize(self, sentence):
""" Normalize the input text sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
str
A text sequence after normalize.
"""
words = normalize(sentence) words = normalize(sentence)
return words return words
def numericalize(self, sentence):
""" Convert a text sequence into ids.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The character id sequence.
"""
ids = [
self.vocab.lookup(item) for item in sentence
if item in self.vocab.stoi
@@ -90,17 +170,46 @@ class EnglishCharacter(Phonetics):
return ids
def reverse(self, ids):
""" Convert a character id sequence into text.
Parameters
-----------
ids: List[int]
The character id sequence.
Returns
----------
str
The text sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Normalize the input text sequence and convert it into a character id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The character id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
class Chinese(Phonetics):
"""Normalize Chinese text sequence and convert it into ids.
"""
def __init__(self):
self.opencc_backend = OpenCC('t2s.json')
self.backend = G2pM()
@@ -115,6 +224,18 @@ class Chinese(Phonetics):
return list(all_syllables)
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[str]
The list of pronunciation sequence.
"""
simplified = self.opencc_backend.convert(sentence) simplified = self.opencc_backend.convert(sentence)
phonemes = self.backend(simplified) phonemes = self.backend(simplified)
start = self.vocab.start_symbol start = self.vocab.start_symbol
@ -136,15 +257,53 @@ class Chinese(Phonetics):
return cleaned_phonemes return cleaned_phonemes
def numericalize(self, phonemes):
""" Convert a pronunciation sequence into a pronunciation id sequence.
Parameters
-----------
phonemes: List[str]
The pronunciation sequence.
Returns
----------
List[int]
The pronunciation id sequence.
"""
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
def __call__(self, sentence):
""" Convert the input text sequence into a pronunciation id sequence.
Parameters
-----------
sentence: str
The input text sequence.
Returns
----------
List[int]
The pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@property
def vocab_size(self):
""" Vocab size.
"""
return len(self.vocab)
def reverse(self, ids):
""" Reverse a pronunciation id sequence to a pronunciation sequence.
Parameters
-----------
ids: List[int]
The pronunciation id sequence.
Returns
----------
List[str]
The pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
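
The Chinese frontend works the same way, but it first converts traditional characters to simplified ones with OpenCC and then uses G2pM to produce pinyin-style syllables. A minimal sketch, assuming opencc and g2pM are installed and that Chinese is importable from parakeet.frontend; the import path and the sample text are assumptions.

from parakeet.frontend import Chinese  # import path is an assumption

frontend = Chinese()
# Traditional -> simplified conversion happens inside phoneticize, then G2pM runs.
syllables = frontend.phoneticize("你好")
ids = frontend.numericalize(syllables)
print(syllables, ids, frontend.vocab_size)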

View File

@@ -1,32 +1,64 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Iterable, List
from collections import OrderedDict
__all__ = ["Vocab"]
class Vocab(object):
""" Vocabulary.
Parameters
-----------
symbols: Iterable[str]
Common symbols.
padding_symbol: str, optional
Symbol for pad. Defaults to "<pad>".
unk_symbol: str, optional
Symbol for unknown. Defaults to "<unk>".
start_symbol: str, optional
Symbol for start. Defaults to "<s>".
end_symbol: str, optional
Symbol for end. Defaults to "</s>".
"""
def __init__(self,
symbols: Iterable[str],
padding_symbol="<pad>",
unk_symbol="<unk>",
start_symbol="<s>",
end_symbol="</s>"):
self.special_symbols = OrderedDict()
for i, item in enumerate(
[padding_symbol, unk_symbol, start_symbol, end_symbol]):
if item:
self.special_symbols[item] = len(self.special_symbols)
self.padding_symbol = padding_symbol
self.unk_symbol = unk_symbol
self.start_symbol = start_symbol
self.end_symbol = end_symbol
self.stoi = OrderedDict()
self.stoi.update(self.special_symbols)
for i, s in enumerate(symbols):
if s not in self.stoi:
self.stoi[s] = len(self.stoi)
@@ -34,49 +66,66 @@ class Vocab(object):
def __len__(self):
return len(self.stoi)
@property
def num_specials(self):
""" The number of special symbols.
"""
return len(self.special_symbols)
# special tokens
@property
def padding_index(self):
""" The index of the padding symbol.
"""
return self.stoi.get(self.padding_symbol, -1)
@property
def unk_index(self):
"""The index of the unknown symbol.
"""
return self.stoi.get(self.unk_symbol, -1)
@property
def start_index(self):
"""The index of start symbol.
"""
return self.stoi.get(self.start_symbol, -1)
@property
def end_index(self):
""" The index of end symbol.
"""
return self.stoi.get(self.end_symbol, -1)
def __repr__(self):
fmt = "Vocab(size: {},\nstoi:\n{})"
return fmt.format(len(self), self.stoi)
def __str__(self):
return self.__repr__()
def lookup(self, symbol):
""" The index that the symbol corresponds to.
"""
return self.stoi[symbol]
def reverse(self, index):
""" The symbol that the index corresponds to.
"""
return self.itos[index]
def add_symbol(self, symbol):
""" Add a new symbol to the vocab.
"""
if symbol in self.stoi:
return
N = len(self.stoi)
self.stoi[symbol] = N
self.itos[N] = symbol
def add_symbols(self, symbols):
""" Add multiple symbols to the vocab.
"""
for symbol in symbols:
self.add_symbol(symbol)
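
A minimal sketch of how the Vocab class documented above behaves; the symbol list is illustrative and the module path parakeet.frontend.vocab is an assumption.

from parakeet.frontend.vocab import Vocab  # module path is an assumption

vocab = Vocab(["a", "b", "c"])
print(len(vocab))           # 7 = 4 special symbols (<pad>, <unk>, <s>, </s>) + 3 common symbols
print(vocab.padding_index)  # 0, the index of "<pad>"
print(vocab.lookup("a"))    # 4, the first index after the special symbols
vocab.add_symbol("d")       # appended with the next free index, here 7
print(vocab.reverse(4))     # "a"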

View File

@@ -32,16 +32,16 @@ class DecoderPreNet(nn.Layer):
Parameters
----------
d_input: int
The input feature size.
d_hidden: int
The hidden size.
d_output: int
The output feature size.
dropout_rate: float
The dropout probability.
"""
@@ -49,7 +49,7 @@ class DecoderPreNet(nn.Layer):
d_input: int,
d_hidden: int,
d_output: int,
dropout_rate: float):
super().__init__()
self.dropout_rate = dropout_rate
@@ -62,12 +62,12 @@ class DecoderPreNet(nn.Layer):
Parameters
----------
x: Tensor [shape=(B, T_mel, C)]
Batch of the sequences of padded mel spectrogram.
Returns
-------
output: Tensor [shape=(B, T_mel, C)]
Batch of the sequences of padded hidden state.
"""
@@ -82,28 +82,28 @@ class DecoderPostNet(nn.Layer):
Parameters
----------
d_mels: int
The number of mel bands.
d_hidden: int
The hidden size of postnet.
kernel_size: int
The kernel size of the conv layer in postnet.
num_layers: int
The number of conv layers in postnet.
dropout: float
The dropout probability.
"""
def __init__(self,
d_mels: int,
d_hidden: int,
kernel_size: int,
num_layers: int,
dropout: float):
super().__init__()
self.dropout = dropout
self.num_layers = num_layers
@@ -150,12 +150,12 @@ class DecoderPostNet(nn.Layer):
Parameters
----------
input: Tensor [shape=(B, T_mel, C)]
Output sequence of features from decoder.
Returns
-------
output: Tensor [shape=(B, T_mel, C)]
Output sequence of features after postnet.
"""
@@ -173,16 +173,16 @@ class Tacotron2Encoder(nn.Layer):
Parameters
----------
d_hidden: int
The hidden size in encoder module.
conv_layers: int
The number of conv layers.
kernel_size: int
The kernel size of conv layers.
p_dropout: float
The dropout probability.
"""
def __init__(self,
@@ -216,15 +216,15 @@ class Tacotron2Encoder(nn.Layer):
Parameters
----------
x: Tensor [shape=(B, T)]
Batch of the sequences of padded character ids.
text_lens: Tensor [shape=(B,)], optional
Batch of lengths of each text input batch. Defaults to None.
Returns
-------
output : Tensor [shape=(B, T, C)]
Batch of the sequences of padded hidden states.
"""
for conv_batchnorm in self.conv_batchnorms:
@@ -241,40 +241,40 @@ class Tacotron2Decoder(nn.Layer):
Parameters
----------
d_mels: int
The number of mel bands.
reduction_factor: int
The reduction factor of tacotron.
d_encoder: int
The hidden size of encoder.
d_prenet: int
The hidden size in decoder prenet.
d_attention_rnn: int
The attention rnn layer hidden size.
d_decoder_rnn: int
The decoder rnn layer hidden size.
d_attention: int
The hidden size of the linear layer in location sensitive attention.
attention_filters: int
The filter size of the conv layer in location sensitive attention.
attention_kernel_size: int
The kernel size of the conv layer in location sensitive attention.
p_prenet_dropout: float
The dropout probability in decoder prenet.
p_attention_dropout: float
The dropout probability in location sensitive attention.
p_decoder_dropout: float
The dropout probability in decoder.
"""
def __init__(self,
@@ -382,25 +382,25 @@ class Tacotron2Decoder(nn.Layer):
Parameters
----------
keys: Tensor[shape=(B, T_key, C)]
Batch of the sequences of padded output from encoder.
querys: Tensor[shape(B, T_query, C)]
Batch of the sequences of padded mel spectrogram.
mask: Tensor
Mask generated with text length. Shape should be (B, T_key, T_query) or broadcastable shape.
Returns
-------
mel_output: Tensor [shape=(B, T_query, C)]
Output sequence of features.
stop_logits: Tensor [shape=(B, T_query)]
Output sequence of stop logits.
alignments: Tensor [shape=(B, T_query, T_key)]
Attention weights.
"""
querys = paddle.reshape(
querys,
@@ -437,25 +437,25 @@ class Tacotron2Decoder(nn.Layer):
Parameters
----------
keys: Tensor [shape=(B, T_key, C)]
Batch of the sequences of padded output from encoder.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this stop threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
mel_output: Tensor [shape=(B, T_mel, C)]
Output sequence of features.
stop_logits: Tensor [shape=(B, T_mel)]
Output sequence of stop logits.
alignments: Tensor [shape=(B, T_mel, T_key)]
Attention weights.
"""
query = paddle.zeros(
@@ -493,75 +493,72 @@ class Tacotron2(nn.Layer):
"""Tacotron2 model for end-to-end text-to-speech (E2E-TTS).
This is a model of Spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions
<https://arxiv.org/abs/1712.05884>`_,
which converts the sequence of characters
into the sequence of mel spectrogram.
Parameters
----------
frontend : parakeet.frontend.Phonetics
Frontend used to preprocess text.
d_mels: int
Number of mel bands.
d_encoder: int
Hidden size in encoder module.
encoder_conv_layers: int
Number of conv layers in encoder.
encoder_kernel_size: int
Kernel size of conv layers in encoder.
d_prenet: int
Hidden size in decoder prenet.
d_attention_rnn: int
Attention rnn layer hidden size in decoder.
d_decoder_rnn: int
Decoder rnn layer hidden size in decoder.
attention_filters: int
Filter size of the conv layer in location sensitive attention.
attention_kernel_size: int
Kernel size of the conv layer in location sensitive attention.
d_attention: int
Hidden size of the linear layer in location sensitive attention.
d_postnet: int
Hidden size of postnet.
postnet_kernel_size: int
Kernel size of the conv layer in postnet.
postnet_conv_layers: int
Number of conv layers in postnet.
reduction_factor: int
Reduction factor of tacotron2.
p_encoder_dropout: float
Dropout probability in encoder.
p_prenet_dropout: float
Dropout probability in decoder prenet.
p_attention_dropout: float
Dropout probability in location sensitive attention.
p_decoder_dropout: float
Dropout probability in decoder.
p_postnet_dropout: float
Dropout probability in postnet.
"""
@@ -616,28 +613,28 @@ class Tacotron2(nn.Layer):
Parameters
----------
text_inputs: Tensor [shape=(B, T_text)]
Batch of the sequences of padded character ids.
mels: Tensor [shape(B, T_mel, C)]
Batch of the sequences of padded mel spectrogram.
text_lens: Tensor [shape=(B,)]
Batch of lengths of each text input batch.
output_lens: Tensor [shape=(B,)], optional
Batch of lengths of each mels batch. Defaults to None.
Returns
-------
outputs : Dict[str, Tensor]
mel_output: output sequence of features (B, T_mel, C);
mel_outputs_postnet: output sequence of features after postnet (B, T_mel, C);
stop_logits: output sequence of stop logits (B, T_mel);
alignments: attention weights (B, T_mel, T_text).
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs, text_lens)
@@ -675,25 +672,25 @@ class Tacotron2(nn.Layer):
Parameters
----------
text_inputs: Tensor [shape=(B, T_text)]
Batch of the sequences of padded character ids.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this stop threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
outputs : Dict[str, Tensor]
mel_output: output sequence of spectrogram (B, T_mel, C);
mel_outputs_postnet: output sequence of spectrogram after postnet (B, T_mel, C);
stop_logits: output sequence of stop logits (B, T_mel);
alignments: attention weights (B, T_mel, T_text).
"""
embedded_inputs = self.embedding(text_inputs)
encoder_outputs = self.encoder(embedded_inputs)
@@ -721,21 +718,21 @@ class Tacotron2(nn.Layer):
Parameters
----------
text: str
Sequence of characters.
stop_threshold: float, optional
Stop synthesizing when the stop logit is greater than this stop threshold. Defaults to 0.5.
max_decoder_steps: int, optional
Maximum number of decoder steps for synthesis. Defaults to 1000.
Returns
-------
outputs : Dict[str, Tensor]
mel_outputs_postnet: output sequence of spectrogram after postnet (T_mel, C);
alignments: attention weights (T_mel, T_text).
"""
ids = np.asarray(self.frontend(text))
ids = paddle.unsqueeze(paddle.to_tensor(ids, dtype='int64'), [0])
@@ -750,21 +747,18 @@ class Tacotron2(nn.Layer):
Parameters
----------
frontend: parakeet.frontend.Phonetics
Frontend used to preprocess text.
config: yacs.config.CfgNode
Model configs.
checkpoint_path: Path or str
The path of the pretrained model checkpoint, without extension name.
Returns
-------
Tacotron2
The model built from the pretrained checkpoint.
"""
model = cls(frontend,
d_mels=config.data.d_mels,
@@ -805,31 +799,31 @@ class Tacotron2Loss(nn.Layer):
Parameters
----------
mel_outputs: Tensor [shape=(B, T_mel, C)]
Output mel spectrogram sequence.
mel_outputs_postnet: Tensor [shape(B, T_mel, C)]
Output mel spectrogram sequence after postnet.
stop_logits: Tensor [shape=(B, T_mel)]
Output sequence of stop logits before sigmoid.
mel_targets: Tensor [shape=(B, T_mel, C)]
Target mel spectrogram sequence.
stop_tokens: Tensor [shape=(B,)]
Target stop token.
Returns
-------
losses : Dict[str, Tensor]
loss: the sum of the other three losses;
mel_loss: MSE loss computed from mel_targets and mel_outputs;
post_mel_loss: MSE loss computed from mel_targets and mel_outputs_postnet;
stop_loss: stop loss computed from stop_logits and the stop token.
"""
mel_loss = paddle.nn.MSELoss()(mel_outputs, mel_targets)
post_mel_loss = paddle.nn.MSELoss()(mel_outputs_postnet, mel_targets)
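
For reference, a minimal sketch with random tensors showing how the two MSE terms in the hunk above are computed; the shapes are illustrative, and the stop loss is omitted because its implementation is not shown in this diff.

import paddle

B, T_mel, C = 2, 100, 80  # illustrative shapes
mel_outputs = paddle.randn([B, T_mel, C])
mel_outputs_postnet = paddle.randn([B, T_mel, C])
mel_targets = paddle.randn([B, T_mel, C])

# The two MSE terms shown above.
mel_loss = paddle.nn.MSELoss()(mel_outputs, mel_targets)
post_mel_loss = paddle.nn.MSELoss()(mel_outputs_postnet, mel_targets)
print(float(mel_loss), float(post_mel_loss))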

View File

@@ -18,6 +18,7 @@ import paddle
from paddle import nn
from paddle.nn import functional as F
def scaled_dot_product_attention(q,
k,
v,
@@ -139,10 +140,11 @@ class MonoheadAttention(nn.Layer):
Feature size of the key of each scaled dot product attention. If not
provided, it is set to `model_dim / num_heads`. Defaults to None.
"""
def __init__(self,
model_dim: int,
dropout: float=0.0,
k_dim: int=None,
v_dim: int=None):
super(MonoheadAttention, self).__init__()
k_dim = k_dim or model_dim
@@ -219,6 +221,7 @@ class MultiheadAttention(nn.Layer):
ValueError
If ``model_dim`` is not divisible by ``num_heads``.
"""
def __init__(self,
model_dim: int,
num_heads: int,
@@ -279,6 +282,28 @@ class LocationSensitiveAttention(nn.Layer):
class LocationSensitiveAttention(nn.Layer):
"""Location Sensitive Attention module.
Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_
Parameters
-----------
d_query: int
The feature size of query.
d_key : int
The feature size of key.
d_attention : int
The feature size of dimension.
location_filters : int
Filter size of attention convolution.
location_kernel_size : int
Kernel size of attention convolution.
"""
def __init__(self,
d_query: int,
d_key: int,
@@ -310,6 +335,34 @@ class LocationSensitiveAttention(nn.Layer):
value,
attention_weights_cat,
mask=None):
"""Compute context vector and attention weights.
Parameters
-----------
query : Tensor [shape=(batch_size, d_query)]
The queries.
processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
The keys after linear layer.
value : Tensor [shape=(batch_size, time_steps_k, d_key)]
The values.
attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
Attention weights concat.
mask : Tensor, optional
The mask. Shape should be (batch_size, times_steps_q, time_steps_k) or broadcastable shape.
Defaults to None.
Returns
----------
attention_context : Tensor [shape=(batch_size, time_steps_q, d_attention)]
The context vector.
attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
The attention weights.
"""
processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
processed_attention_weights = self.location_layer(