From c43216ae9bbf367f1b407bc48d44503819bba114 Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Thu, 22 Oct 2020 05:04:45 +0000
Subject: [PATCH] 1. API renaming Conv1d -> Conv1D, BatchNorm1d -> BatchNorm1D;
 2. add losses in parakeet/modules; 3. fix a bug in phonetics; 4.
 TransformerTTS update: encoder dim can be different from decoder dim; 5.
 MultiheadAttention in TransformerTTS: add k_input_dim & v_input_dim in
 __init__ to allow different feature sizes for k and v.

---
 .gitignore                         |   3 +
 parakeet/audio/__init__.py         |   3 +-
 parakeet/audio/audio.py            | 322 ++++++-----------------------
 parakeet/audio/spec_normalizer.py  |  12 +-
 parakeet/data/datasets.py          |  26 ++-
 parakeet/frontend/phonectic.py     |  12 +-
 parakeet/frontend/vocab.py         |   3 +-
 parakeet/models/deepvoice3.py      |   2 +-
 parakeet/models/transformer_tts.py |  77 ++++---
 parakeet/modules/cbhg.py           |   2 +-
 parakeet/modules/conv.py           |   6 +-
 parakeet/modules/losses.py         |  31 +++
 tests/test_losses.py               |  33 +++
 tests/test_transformer_tts.py      |   3 +-
 14 files changed, 230 insertions(+), 305 deletions(-)
 create mode 100644 parakeet/modules/losses.py
 create mode 100644 tests/test_losses.py
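Note: item 1 tracks the paddle 2.0 naming convention, which upper-cases the
dimensionality suffix of layers (nn.Conv1D, nn.BatchNorm1D). A minimal sketch
of the renamed layers in isolation; the shapes here are illustrative, not
taken from this patch:

    import paddle
    from paddle import nn

    conv = nn.Conv1D(in_channels=4, out_channels=8, kernel_size=3, padding=1)
    bn = nn.BatchNorm1D(8)
    x = paddle.randn([2, 4, 16])  # (batch, channels, time), default NCL layout
    y = bn(conv(x))               # shape: [2, 8, 16]
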
diff --git a/.gitignore b/.gitignore
index 7906666..25f2656 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 *.udb
 *.ann
 
+# data
+datasets/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py
index 253a887..7fc437c 100644
--- a/parakeet/audio/__init__.py
+++ b/parakeet/audio/__init__.py
@@ -12,4 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .audio import AudioProcessor
\ No newline at end of file
+from .audio import AudioProcessor
+from .spec_normalizer import NormalizerBase, LogMagnitude
\ No newline at end of file
diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py
index 9133a47..48722da 100644
--- a/parakeet/audio/audio.py
+++ b/parakeet/audio/audio.py
@@ -15,278 +15,80 @@
 import librosa
 import soundfile as sf
 import numpy as np
-import scipy.io
-import scipy.signal
-
 
 class AudioProcessor(object):
-    def __init__(
-            self,
-            sample_rate=None,  # int, sampling rate
-            num_mels=None,  # int, bands of mel spectrogram
-            min_level_db=None,  # float, minimum level db
-            ref_level_db=None,  # float, reference level db
-            n_fft=None,  # int: number of samples in a frame for stft
-            win_length=None,  # int: the same meaning with n_fft
-            hop_length=None,  # int: number of samples between neighboring frame
-            power=None,  # float: power to raise before griffin-lim
-            preemphasis=None,  # float: preemphasis coefficident
-            signal_norm=None,  #
-            symmetric_norm=False,  # bool, apply clip norm in [-max_norm, max_form]
-            max_norm=None,  # float, max norm
-            mel_fmin=None,  # int: mel spectrogram's minimum frequency
-            mel_fmax=None,  # int: mel spectrogram's maximum frequency
-            clip_norm=True,  # bool: clip spectrogram's norm
-            griffin_lim_iters=None,  # int:
-            do_trim_silence=False,  # bool: trim silence
-            sound_norm=False,
-            **kwargs):
+    def __init__(self,
+                 sample_rate:int,
+                 n_fft:int,
+                 win_length:int,
+                 hop_length:int,
+                 n_mels:int=80,
+                 f_min:int=0,
+                 f_max:int=None,
+                 window="hann",
+                 center=True,
+                 pad_mode="reflect"):
+        # read & write
         self.sample_rate = sample_rate
-        self.num_mels = num_mels
-        self.min_level_db = min_level_db
-        self.ref_level_db = ref_level_db
 
-        # stft related
+        # stft
         self.n_fft = n_fft
-        self.win_length = win_length or n_fft
-        # hop length defaults to 1/4 window_length
-        self.hop_length = hop_length or 0.25 * self.win_length
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.window = window
+        self.center = center
+        self.pad_mode = pad_mode
+
+        # mel
+        self.n_mels = n_mels
+        self.f_min = f_min
+        self.f_max = f_max
 
-        self.power = power
-        self.preemphasis = float(preemphasis)
+        self.mel_filter = self._create_mel_filter()
+        self.inv_mel_filter = np.linalg.pinv(self.mel_filter)
+
+    def _create_mel_filter(self):
+        mel_filter = librosa.filters.mel(
+            self.sample_rate,
+            self.n_fft,
+            n_mels=self.n_mels,
+            fmin=self.f_min,
+            fmax=self.f_max)
+        return mel_filter
 
-        self.griffin_lim_iters = griffin_lim_iters
-        self.signal_norm = signal_norm
-        self.symmetric_norm = symmetric_norm
+    def read_wav(self, filename):
+        # resampling may occur
+        wav, _ = librosa.load(filename, sr=self.sample_rate)
+        return wav
 
-        # mel transform related
-        self.mel_fmin = mel_fmin
-        self.mel_fmax = mel_fmax
+    def write_wav(self, path, wav):
+        sf.write(path, wav, samplerate=self.sample_rate)
 
-        self.max_norm = 1.0 if max_norm is None else float(max_norm)
-        self.clip_norm = clip_norm
-        self.do_trim_silence = do_trim_silence
-
-        self.sound_norm = sound_norm
-        self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
-        )
-
-    def _stft_parameters(self):
-        """compute frame length and hop length in ms"""
-        frame_length_ms = self.win_length * 1. / self.sample_rate
-        frame_shift_ms = self.hop_length * 1. / self.sample_rate
-        num_freq = 1 + self.n_fft // 2
-        return num_freq, frame_length_ms, frame_shift_ms
-
-    def __repr__(self):
-        """object repr"""
-        cls_name_str = self.__class__.__name__
-        members = vars(self)
-        dict_str = "\n".join(
-            ["  {}: {},".format(k, v) for k, v in members.items()])
-        repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
-        return repr_str
-
-    def save_wav(self, path, wav):
-        """save audio with scipy.io.wavfile in 16bit integers"""
-        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-        scipy.io.wavfile.write(path, self.sample_rate,
-                               wav_norm.as_type(np.int16))
-
-    def load_wav(self, path, sr=None):
-        """load wav -> trim_silence -> rescale"""
-
-        x, sr = librosa.load(path, sr=None)
-        assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
-            sr, self.sample_rate)
-        if self.do_trim_silence:
-            try:
-                x = self.trim_silence(x)
-            except ValueError:
-                print(" [!] File cannot be trimmed for silence - {}".format(
-                    path))
-        if self.sound_norm:
-            x = x / x.max() * 0.9  # why 0.9 ?
-        return x
-
-    def trim_silence(self, wav):
-        """Trim soilent parts with a threshold and 0.01s margin"""
-        margin = int(self.sample_rate * 0.01)
-        wav = wav[margin:-margin]
-        trimed_wav = librosa.effects.trim(
+    def stft(self, wav):
+        D = librosa.core.stft(
             wav,
-            top_db=60,
-            frame_length=self.win_length,
-            hop_length=self.hop_length)[0]
-        return trimed_wav
-
-    def apply_preemphasis(self, x):
-        if self.preemphasis == 0.:
-            raise RuntimeError(
-                " !! Preemphasis coefficient should be positive. ")
-        return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
-
-    def apply_inv_preemphasis(self, x):
-        if self.preemphasis == 0.:
-            raise RuntimeError(
-                " !! Preemphasis coefficient should be positive. ")
") - return scipy.signal.lfilter([1.], [1., -self.preemphasis], x) - - def _amplitude_to_db(self, x): - amplitude_min = np.exp(self.min_level_db / 20 * np.log(10)) - return 20 * np.log10(np.maximum(amplitude_min, x)) - - @staticmethod - def _db_to_amplitude(x): - return np.power(10., 0.05 * x) - - def _linear_to_mel(self, spectrogram): - _mel_basis = self._build_mel_basis() - return np.dot(_mel_basis, spectrogram) - - def _mel_to_linear(self, mel_spectrogram): - inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) - return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram)) - - def _build_mel_basis(self): - """return mel basis for mel scale""" - if self.mel_fmax is not None: - assert self.mel_fmax <= self.sample_rate // 2 - return librosa.filters.mel(self.sample_rate, - self.n_fft, - n_mels=self.num_mels, - fmin=self.mel_fmin, - fmax=self.mel_fmax) - - def _normalize(self, S): - """put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]""" - if self.signal_norm: - S_norm = (S - self.min_level_db) / (-self.min_level_db) - if self.symmetric_norm: - S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm - if self.clip_norm: - S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) - return S_norm - else: - S_norm = self.max_norm * S_norm - if self.clip_norm: - S_norm = np.clip(S_norm, 0, self.max_norm) - return S_norm - else: - return S - - def _denormalize(self, S): - """denormalize values""" - S_denorm = S - if self.signal_norm: - if self.symmetric_norm: - if self.clip_norm: - S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) - S_denorm = (S_denorm + self.max_norm) * ( - -self.min_level_db) / (2 * self.max_norm - ) + self.min_level_db - return S_denorm - else: - if self.clip_norm: - S_denorm = np.clip(S_denorm, 0, self.max_norm) - S_denorm = S_denorm * (-self.min_level_db - ) / self.max_norm + self.min_level_db - return S_denorm - else: - return S - - def _stft(self, y): - return librosa.stft( - y=y, - n_fft=self.n_fft, + n_fft = self.n_fft, + hop_length=self.hop_length, win_length=self.win_length, - hop_length=self.hop_length) + window=self.window, + center=self.center, + pad_mode=self.pad_mode) + return D - def _istft(self, S): - return librosa.istft( - S, hop_length=self.hop_length, win_length=self.win_length) + def istft(self, D): + wav = librosa.core.istft( + D, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center) + return wav - def spectrogram(self, y): - """compute linear spectrogram(amplitude) - preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize - """ - if self.preemphasis: - D = self._stft(self.apply_preemphasis(y)) - else: - D = self._stft(y) - S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db - return self._normalize(S) + def spectrogram(self, wav): + D = self.stft(wav) + return np.abs(D) - def melspectrogram(self, y): - """compute linear spectrogram(amplitude) - preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize - """ - if self.preemphasis: - D = self._stft(self.apply_preemphasis(y)) - else: - D = self._stft(y) - S = self._amplitude_to_db(self._linear_to_mel(np.abs( - D))) - self.ref_level_db - return self._normalize(S) - - def inv_spectrogram(self, spectrogram): - """convert spectrogram back to waveform using griffin_lim in librosa""" - S = self._denormalize(spectrogram) - S = self._db_to_amplitude(S + self.ref_level_db) - if self.preemphasis: - return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - 
diff --git a/parakeet/audio/spec_normalizer.py b/parakeet/audio/spec_normalizer.py
index 9eefb53..793aeef 100644
--- a/parakeet/audio/spec_normalizer.py
+++ b/parakeet/audio/spec_normalizer.py
@@ -7,6 +7,9 @@
 the generated spectrogram so as to be used with vocoders like griffin lim.
 The base class describe the interface. `transform` is used to perform
 transformation and `inverse` is used to perform the inverse transformation.
+
+check issues:
+https://github.com/mozilla/TTS/issues/377
 """
 import numpy as np
@@ -18,6 +21,9 @@ class NormalizerBase(object):
         raise NotImplementedError("inverse must be implemented")
 
 class LogMagnitude(NormalizerBase):
+    """
+    This is a simple normalizer used in WaveGlow, WaveFlow, Tacotron 2, etc.
+    """
     def __init__(self, min=1e-7):
         self.min = min
 
@@ -28,7 +34,11 @@ class LogMagnitude(NormalizerBase):
 
     def inverse(self, x):
         return np.exp(x)
-
+
+
 class UnitMagnitude(NormalizerBase):
     # dbscale and (0, 1) normalization
+    """
+    This is the normalizer used in mozilla/TTS (db scale with (0, 1)
+    normalization; see the issue linked above).
+    """
     pass
\ No newline at end of file
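Note: LogMagnitude pairs with the linear-magnitude spectrograms above.
Judging from the min parameter and inverse, transform is log(maximum(x, min))
and inverse is exp, so a round trip reproduces the input up to the floor
(this reading of transform is an assumption; the method body is not in this
hunk):

    import numpy as np
    from parakeet.audio import LogMagnitude

    normalizer = LogMagnitude(min=1e-7)
    log_mel = normalizer.transform(mel)   # mel from the sketch above
    recovered = normalizer.inverse(log_mel)
    # recovered == np.maximum(mel, 1e-7), up to floating point error
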
diff --git a/parakeet/data/datasets.py b/parakeet/data/datasets.py
index 023577d..35e18ab 100644
--- a/parakeet/data/datasets.py
+++ b/parakeet/data/datasets.py
@@ -1,7 +1,7 @@
 from paddle.io import Dataset
-
 from os import listdir
 from os.path import splitext, join
+from pathlib import Path
 import librosa
 
 class AudioFolderDataset(Dataset):
@@ -19,4 +19,26 @@
     def __getitem__(self, i):
         file_name = self.file_names[i]
         y, sr = librosa.load(file_name, sr=self.sample_rate)  # pylint: disable=unused-variable
-        return y
\ No newline at end of file
+        return y
+
+
+class LJSpeechMetaData(Dataset):
+    def __init__(self, root):
+        self.root = Path(root).expanduser()
+        wav_dir = self.root / "wavs"
+        csv_path = self.root / "metadata.csv"
+        records = []
+        speaker_name = "ljspeech"
+        with open(str(csv_path), 'rt') as f:
+            for line in f:
+                filename, _, normalized_text = line.strip().split("|")
+                filename = str(wav_dir / (filename + ".wav"))
+                records.append([filename, normalized_text, speaker_name])
+        self.records = records
+
+    def __getitem__(self, i):
+        return self.records[i]
+
+    def __len__(self):
+        return len(self.records)
+
diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py
index 34be5fc..97748fe 100644
--- a/parakeet/frontend/phonectic.py
+++ b/parakeet/frontend/phonectic.py
@@ -27,7 +27,12 @@ class English(Phonetics):
         self.vocab = Vocab(self.phonemes + self.punctuations)
 
     def phoneticize(self, sentence):
-        return self.backend(sentence)
+        start = self.vocab.start_symbol
+        end = self.vocab.end_symbol
+        phonemes = ([] if start is None else [start]) \
+                   + self.backend(sentence) \
+                   + ([] if end is None else [end])
+        return phonemes
 
     def numericalize(self, phonemes):
         ids = [self.vocab.lookup(item) for item in phonemes if item in self.vocab.stoi]
@@ -58,6 +63,11 @@ class Chinese(Phonetics):
     def phoneticize(self, sentence):
         simplified = self.opencc_backend.convert(sentence)
         phonemes = self.backend(simplified)
+        start = self.vocab.start_symbol
+        end = self.vocab.end_symbol
+        phonemes = ([] if start is None else [start]) \
+                   + phonemes \
+                   + ([] if end is None else [end])
         return self._filter_symbols(phonemes)
 
     def _filter_symbols(self, phonemes):
diff --git a/parakeet/frontend/vocab.py b/parakeet/frontend/vocab.py
index 3dbf316..e773ac8 100644
--- a/parakeet/frontend/vocab.py
+++ b/parakeet/frontend/vocab.py
@@ -22,11 +22,10 @@ class Vocab(object):
 
         self.stoi = OrderedDict()
         self.stoi.update(self.special_symbols)
-        N = len(self.special_symbols)
 
         for i, s in enumerate(symbols):
             if s not in self.stoi:
-                self.stoi[s] = N +i
+                self.stoi[s] = len(self.stoi)
         self.itos = {v: k for k, v in self.stoi.items()}
 
     def __len__(self):
diff --git a/parakeet/models/deepvoice3.py b/parakeet/models/deepvoice3.py
index e44edcb..896c119 100644
--- a/parakeet/models/deepvoice3.py
+++ b/parakeet/models/deepvoice3.py
@@ -21,7 +21,7 @@ class ConvBlock(nn.Layer):
         std = math.sqrt(4 * keep_prob / (kernel_size * in_channel))
         padding = "valid" if causal else "same"
-        conv = nn.Conv1d(in_channel, 2 * in_channel, (kernel_size, ),
+        conv = nn.Conv1D(in_channel, 2 * in_channel, (kernel_size, ),
                          padding=padding,
                          data_format="NLC",
                          weight_attr=I.Normal(scale=std))
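Note: LJSpeechMetaData above only indexes the metadata; audio loading is left
to downstream transforms. A usage sketch (the local dataset path is
hypothetical):

    from parakeet.data.datasets import LJSpeechMetaData

    meta = LJSpeechMetaData("~/datasets/LJSpeech-1.1")
    filename, normalized_text, speaker = meta[0]
    print(len(meta), filename, speaker)  # speaker is always "ljspeech"
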
a/parakeet/models/transformer_tts.py b/parakeet/models/transformer_tts.py
index 86993dd..e4b6e7a 100644
--- a/parakeet/models/transformer_tts.py
+++ b/parakeet/models/transformer_tts.py
@@ -2,11 +2,12 @@
 import math
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
+from paddle.nn import initializer as I
 
 from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention
 from parakeet.modules.transformer import PositionwiseFFN
 from parakeet.modules import masking
-from parakeet.modules.cbhg import Conv1dBatchNorm
+from parakeet.modules.conv import Conv1dBatchNorm
 from parakeet.modules import positional_encoding as pe
 
 __all__ = ["TransformerTTS"]
@@ -21,7 +22,7 @@ class MultiheadAttention(nn.Layer):
     Another deviation is that it concats the input query and context vector
     before applying the output projection.
     """
-    def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None):
+    def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None, k_input_dim=None, v_input_dim=None):
         """
         Args:
             model_dim (int): the feature size of query.
@@ -42,9 +43,11 @@
         depth = model_dim // num_heads
         k_dim = k_dim or depth
         v_dim = v_dim or depth
+        k_input_dim = k_input_dim or model_dim
+        v_input_dim = v_input_dim or model_dim
         self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
-        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
-        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
+        self.affine_k = nn.Linear(k_input_dim, num_heads * k_dim)
+        self.affine_v = nn.Linear(v_input_dim, num_heads * v_dim)
         self.affine_o = nn.Linear(model_dim + num_heads * v_dim, model_dim)
 
         self.num_heads = num_heads
@@ -128,7 +131,7 @@ class TransformerDecoderLayer(nn.Layer):
     """
     Transformer decoder layer.
     """
-    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
+    def __init__(self, d_model, n_heads, d_ffn, dropout=0., d_encoder=None):
         """
         Args:
             d_model (int): the feature size of the input, and the output.
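Note: item 5 in context. With k_input_dim/v_input_dim, cross attention can
read encoder states whose feature size differs from the decoder's d_model.
Projection shapes implied by the __init__ above (the dims are illustrative):

    # model_dim=128, num_heads=4 -> depth = k_dim = v_dim = 32
    attn = MultiheadAttention(model_dim=128, num_heads=4,
                              k_input_dim=64, v_input_dim=64)
    # affine_q: Linear(128, 128)  queries come from the decoder (d_model)
    # affine_k: Linear(64, 128)   keys come from the encoder (d_encoder=64)
    # affine_v: Linear(64, 128)   values come from the encoder (d_encoder=64)
    # affine_o: Linear(128 + 128, 128)
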
@@ -141,7 +144,7 @@
         self.self_mha = MultiheadAttention(d_model, n_heads)
         self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
 
-        self.cross_mha = MultiheadAttention(d_model, n_heads)
+        self.cross_mha = MultiheadAttention(d_model, n_heads, k_input_dim=d_encoder, v_input_dim=d_encoder)
         self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
 
         self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
@@ -194,10 +197,10 @@
 
 class TransformerDecoder(nn.LayerList):
-    def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0.):
+    def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0., d_encoder=None):
         super(TransformerDecoder, self).__init__()
         for _ in range(n_layers):
-            self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout))
+            self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout, d_encoder=d_encoder))
 
     def forward(self, q, k, v, encoder_mask, decoder_mask):
         self_attention_weights = []
@@ -233,7 +236,7 @@
             c_out = d_output if i == n_layers - 1 else d_hidden
             self.convs.append(
                 Conv1dBatchNorm(c_in, c_out, kernel_size, padding=padding))
-        self.last_norm = nn.BatchNorm1d(d_output)
+        self.last_norm = nn.BatchNorm1D(d_output)
 
     def forward(self, x):
         x_in = x
@@ -244,44 +247,51 @@
 
 class TransformerTTS(nn.Layer):
-    def __init__(self, vocab_size, padding_idx, d_model, d_mel, n_heads, d_ffn, positional_encoding_scalar,
+    def __init__(self, vocab_size, padding_idx, d_encoder, d_decoder, d_mel, n_heads, d_ffn,
                  encoder_layers, decoder_layers, d_prenet, d_postnet, postnet_layers,
                  postnet_kernel_size, max_reduction_factor, dropout):
         super(TransformerTTS, self).__init__()
-        self.encoder_prenet = nn.Embedding(vocab_size, d_model, padding_idx)
-        self.encoder_pe = pe.positional_encoding(0, 1000, d_model)  # it may be extended later
-        self.encoder = TransformerEncoder(d_model, n_heads, d_ffn, encoder_layers, dropout)
+        # initial pe scalar is 1, though it is trainable
+        self.pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
 
-        self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_model, dropout)
-        self.decoder_pe = pe.positional_encoding(0, 1000, d_model)  # it may be extended later
-        self.decoder = TransformerDecoder(d_model, n_heads, d_ffn, decoder_layers, dropout)
-        self.final_proj = nn.Linear(d_model, max_reduction_factor * d_mel)
+        # encoder
+        self.encoder_prenet = nn.Embedding(vocab_size, d_encoder, padding_idx)
+        self.encoder_pe = pe.positional_encoding(0, 1000, d_encoder)  # it may be extended later
+        self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn, encoder_layers, dropout)
+
+        # decoder
+        self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
+        self.decoder_pe = pe.positional_encoding(0, 1000, d_decoder)  # it may be extended later
+        self.decoder = TransformerDecoder(d_decoder, n_heads, d_ffn, decoder_layers, dropout, d_encoder=d_encoder)
+        self.final_proj = nn.Linear(d_decoder, max_reduction_factor * d_mel)
         self.decoder_postnet = CNNPostNet(d_mel, d_postnet, d_mel, postnet_kernel_size, postnet_layers)
         self.stop_conditioner = nn.Linear(d_mel, 3)
 
         # specs
         self.padding_idx = padding_idx
-        self.d_model = d_model
-        self.pe_scalar = positional_encoding_scalar
+        self.d_encoder = d_encoder
+        self.d_decoder = d_decoder
 
-        # start and end
+        # start and end: though it is only used in predict
+        # it can also be used in training
         dtype = paddle.get_default_dtype()
-        self.start_vec = paddle.fill_constant([1, d_mel], dtype=dtype, value=0)
-        self.end_vec = paddle.fill_constant([1, d_mel], dtype=dtype, value=0)
+        self.start_vec = paddle.full([1, d_mel], 0, dtype=dtype)
+        self.end_vec = paddle.full([1, d_mel], 0, dtype=dtype)
         self.stop_prob_index = 2
         self.max_r = max_reduction_factor
         self.r = max_reduction_factor  # set it every call
 
     def forward(self, text, mel, stop):
-        pass
-
+        encoded, encoder_attention_weights, encoder_mask = self.encode(text)
+        mel_output, mel_intermediate, cross_attention_weights, stop_logits = self.decode(encoded, mel, encoder_mask)
+        return mel_output, mel_intermediate, encoder_attention_weights, cross_attention_weights
+
     def encode(self, text):
         T_enc = text.shape[-1]
         embed = self.encoder_prenet(text)
         pe = self.encoder_pe[:T_enc, :]  # (T, C)
-        x = embed.scale(math.sqrt(self.d_model)) + pe.scale(self.pe_scalar)
+        x = embed.scale(math.sqrt(self.d_encoder)) + pe * self.pe_scalar
 
         encoder_padding_mask = masking.id_mask(text, self.padding_idx, dtype=x.dtype)
         x = F.dropout(x, training=self.training)
@@ -341,8 +351,13 @@
                 break
 
         return decoder_output[:, 1:, :], encoder_attentions, cross_attention_weights
-
-
-
-
-
\ No newline at end of file
+
+
+class TransformerTTSLoss(nn.Layer):
+    def __init__(self, stop_loss_scale):
+        super(TransformerTTSLoss, self).__init__()
+        self.stop_loss_scale = stop_loss_scale
+
+    def forward(self, mel_output, mel_intermediate, mel_target, stop_logits, stop_label, mask):
+        # NOTE: a provisional sketch of the intended loss; the argument list
+        # and the exact weighting are not final.
+        from parakeet.modules.losses import masked_l1_loss, masked_softmax_with_cross_entropy
+        mel_loss1 = masked_l1_loss(mel_output, mel_target, mask)
+        mel_loss2 = masked_l1_loss(mel_intermediate, mel_target, mask)
+        stop_loss = masked_softmax_with_cross_entropy(stop_logits, stop_label, mask)
+        loss = mel_loss1 + mel_loss2 + self.stop_loss_scale * stop_loss
+        details = dict(mel_loss1=mel_loss1, mel_loss2=mel_loss2, stop_loss=stop_loss)
+        return loss, details
\ No newline at end of file
diff --git a/parakeet/modules/cbhg.py b/parakeet/modules/cbhg.py
index 39edfe3..03bc108 100644
--- a/parakeet/modules/cbhg.py
+++ b/parakeet/modules/cbhg.py
@@ -40,7 +40,7 @@ class CBHG(nn.Layer):
         proj_out_channels = projection_channels + \
             [in_channels]  # ensure residual connection
         for c_in, c_out in zip(proj_in_channels, proj_out_channels):
-            conv = nn.Conv1d(c_in, c_out, (3,), padding=(1, 1))
+            conv = nn.Conv1D(c_in, c_out, (3,), padding=(1, 1))
             self.projections.append(conv)
 
         if in_channels != highway_features:
diff --git a/parakeet/modules/conv.py b/parakeet/modules/conv.py
index a36d2f6..e50d95a 100644
--- a/parakeet/modules/conv.py
+++ b/parakeet/modules/conv.py
@@ -1,7 +1,7 @@
 import paddle
 from paddle import nn
 
-class Conv1dCell(nn.Conv1d):
+class Conv1dCell(nn.Conv1D):
     """
     A subclass of Conv1d layer, which can be used like an RNN cell. It can take
     step input and return step output. It is done by keeping an internal buffer,
@@ -86,12 +86,12 @@ class Conv1dBatchNorm(nn.Layer):
                  weight_attr=None,
                  bias_attr=None):
         super(Conv1dBatchNorm, self).__init__()
         # TODO(chenfeiyu): carefully initialize Conv1d's weight
-        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride,
+        self.conv = nn.Conv1D(in_channels, out_channels, kernel_size, stride,
                               padding=padding,
                               weight_attr=weight_attr,
                               bias_attr=bias_attr)
         # TODO: channel last, but BatchNorm1d does not support channel last layout
-        self.bn = nn.BatchNorm1d(out_channels)
+        self.bn = nn.BatchNorm1D(out_channels)
 
     def forward(self, x):
         return self.bn(self.conv(x))
diff --git a/parakeet/modules/losses.py b/parakeet/modules/losses.py
new file mode 100644
index 0000000..ad29e0d
--- /dev/null
+++ b/parakeet/modules/losses.py
@@ -0,0 +1,31 @@
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+def weighted_mean(input, weight):
+    """weighted mean. (It can also be used as masked mean.)
+
+    Args:
+        input (Tensor): input tensor, floating point dtype.
+        weight (Tensor): weight tensor with broadcastable shape.
+
+    Returns:
+        Tensor: shape(1,), weighted mean tensor with the same dtype as input.
+    """
+    weight = paddle.cast(weight, input.dtype)
+    broadcast_factor = input.numel() / weight.numel()
+    return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_factor)
+
+def masked_l1_loss(prediction, target, mask):
+    abs_error = F.l1_loss(prediction, target, reduction='none')
+    return weighted_mean(abs_error, mask)
+
+def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
+    ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
+    return weighted_mean(ce, mask)
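Note: weighted_mean divides by sum(weight) scaled by the broadcast factor, so
a (T, 1) mask against a (T, C) input acts as a per-position mask instead of
inflating the denominator. A worked check that mirrors the unit test below:

    import paddle
    from parakeet.modules.losses import weighted_mean

    x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
    mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)  # rows 5..9
    # sum(x * mask) = (5+6+7+8+9) * 3 = 105; sum(mask) * (30 / 10) = 15
    print(weighted_mean(x, mask))  # -> 7.0
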
diff --git a/tests/test_losses.py b/tests/test_losses.py
new file mode 100644
index 0000000..fa38eee
--- /dev/null
+++ b/tests/test_losses.py
@@ -0,0 +1,33 @@
+import unittest
+import paddle
+paddle.set_device("cpu")
+import numpy as np
+
+from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy
+
+class TestWeightedMean(unittest.TestCase):
+    def test(self):
+        x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
+        mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
+        loss = weighted_mean(x, mask)
+        self.assertAlmostEqual(loss.numpy()[0], 7)
+
+
+class TestMaskedL1Loss(unittest.TestCase):
+    def test(self):
+        x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
+        y = paddle.zeros_like(x)
+        mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
+        loss = masked_l1_loss(x, y, mask)
+        print(loss)
+        self.assertAlmostEqual(loss.numpy()[0], 7)
+
+
+class TestMaskedCrossEntropy(unittest.TestCase):
+    def test(self):
+        x = paddle.randn([3, 30, 8], dtype="float64")
+        y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1)  # mind this
+        mask = paddle.fluid.layers.sequence_mask(
+            paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
+        loss = masked_softmax_with_cross_entropy(x, y, mask)
+        print(loss)
diff --git a/tests/test_transformer_tts.py b/tests/test_transformer_tts.py
index 04676bc..236c06e 100644
--- a/tests/test_transformer_tts.py
+++ b/tests/test_transformer_tts.py
@@ -63,8 +63,7 @@ class TestTransformerDecoderLayer(unittest.TestCase):
 class TestTransformerTTS(unittest.TestCase):
     def setUp(self):
         net = tts.TransformerTTS(
-            128, 0, 64, 80, 4, 128,
-            0.5,
+            128, 0, 64, 128, 80, 4, 128,
             6, 6, 128, 128, 4, 3, 10, 0.5)
         self.net = net
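
Note: an end-to-end sketch of the updated TransformerTTS constructor,
mirroring the unit test above. All sizes are illustrative; per this patch,
forward returns mel_output, mel_intermediate, encoder attentions and cross
attentions, and its stop argument is unused, so None is passed here:

    import paddle
    from parakeet.models import transformer_tts as tts

    net = tts.TransformerTTS(
        128, 0,        # vocab_size, padding_idx
        64, 128,       # d_encoder, d_decoder (may now differ, item 4)
        80, 4, 128,    # d_mel, n_heads, d_ffn
        6, 6,          # encoder_layers, decoder_layers
        128, 128,      # d_prenet, d_postnet
        4, 3,          # postnet_layers, postnet_kernel_size
        10, 0.5)       # max_reduction_factor, dropout
    text = paddle.randint(0, 128, [2, 20], dtype="int64")
    mel = paddle.randn([2, 60, 80])
    mel_output, mel_intermediate, enc_attns, cross_attns = net(text, mel, None)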