1. API renaming Conv1d -> Conv1D, BatchNorm1d -> BatchNorm1D;
2. add losses in parakeet/modules; 3. fix a bug in phonetics; 4. TransformerTTS update: encoder dim can be different from decoder dim; 5. MultiHeadAttention in TransformerTTS: add k_input_dim & v_input_dim in __init__ to allow different feature sizes for k and v.
This commit is contained in:
parent
2a764d9a10
commit
c43216ae9b
|
@ -4,6 +4,9 @@
|
|||
*.udb
|
||||
*.ann
|
||||
|
||||
# data
|
||||
datasets/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
|
|
@ -13,3 +13,4 @@
|
|||
# limitations under the License.
|
||||
|
||||
from .audio import AudioProcessor
|
||||
from .spec_normalizer import NormalizerBase, LogMagnitude
|
|
@ -15,278 +15,80 @@
|
|||
import librosa
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
import scipy.io
|
||||
import scipy.signal
|
||||
|
||||
|
||||
class AudioProcessor(object):
|
||||
def __init__(
|
||||
self,
|
||||
sample_rate=None, # int, sampling rate
|
||||
num_mels=None, # int, bands of mel spectrogram
|
||||
min_level_db=None, # float, minimum level db
|
||||
ref_level_db=None, # float, reference level db
|
||||
n_fft=None, # int: number of samples in a frame for stft
|
||||
win_length=None, # int: the same meaning with n_fft
|
||||
hop_length=None, # int: number of samples between neighboring frame
|
||||
power=None, # float:power to raise before griffin-lim
|
||||
preemphasis=None, # float: preemphasis coefficident
|
||||
signal_norm=None, #
|
||||
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
|
||||
max_norm=None, # float, max norm
|
||||
mel_fmin=None, # int: mel spectrogram's minimum frequency
|
||||
mel_fmax=None, # int: mel spectrogram's maximum frequency
|
||||
clip_norm=True, # bool: clip spectrogram's norm
|
||||
griffin_lim_iters=None, # int:
|
||||
do_trim_silence=False, # bool: trim silence
|
||||
sound_norm=False,
|
||||
**kwargs):
|
||||
def __init__(self,
|
||||
sample_rate:int,
|
||||
n_fft:int,
|
||||
win_length:int,
|
||||
hop_length:int,
|
||||
n_mels:int=80,
|
||||
f_min:int=0,
|
||||
f_max:int=None,
|
||||
window="hann",
|
||||
center="True",
|
||||
pad_mode="reflect"):
|
||||
# read & write
|
||||
self.sample_rate = sample_rate
|
||||
self.num_mels = num_mels
|
||||
self.min_level_db = min_level_db
|
||||
self.ref_level_db = ref_level_db
|
||||
|
||||
# stft related
|
||||
# stft
|
||||
self.n_fft = n_fft
|
||||
self.win_length = win_length or n_fft
|
||||
# hop length defaults to 1/4 window_length
|
||||
self.hop_length = hop_length or 0.25 * self.win_length
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
self.window = window
|
||||
self.center = center
|
||||
self.pad_mode = pad_mode
|
||||
|
||||
self.power = power
|
||||
self.preemphasis = float(preemphasis)
|
||||
# mel
|
||||
self.n_mels = n_mels
|
||||
self.f_min = f_min
|
||||
self.f_max = f_max
|
||||
|
||||
self.griffin_lim_iters = griffin_lim_iters
|
||||
self.signal_norm = signal_norm
|
||||
self.symmetric_norm = symmetric_norm
|
||||
self.mel_filter = self._create_mel_filter()
|
||||
self.inv_mel_filter = np.linalg.pinv(self.mel_filter)
|
||||
|
||||
# mel transform related
|
||||
self.mel_fmin = mel_fmin
|
||||
self.mel_fmax = mel_fmax
|
||||
|
||||
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
||||
self.clip_norm = clip_norm
|
||||
self.do_trim_silence = do_trim_silence
|
||||
|
||||
self.sound_norm = sound_norm
|
||||
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
|
||||
)
|
||||
|
||||
def _stft_parameters(self):
|
||||
"""compute frame length and hop length in ms"""
|
||||
frame_length_ms = self.win_length * 1. / self.sample_rate
|
||||
frame_shift_ms = self.hop_length * 1. / self.sample_rate
|
||||
num_freq = 1 + self.n_fft // 2
|
||||
return num_freq, frame_length_ms, frame_shift_ms
|
||||
|
||||
def __repr__(self):
|
||||
"""object repr"""
|
||||
cls_name_str = self.__class__.__name__
|
||||
members = vars(self)
|
||||
dict_str = "\n".join(
|
||||
[" {}: {},".format(k, v) for k, v in members.items()])
|
||||
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
|
||||
return repr_str
|
||||
|
||||
def save_wav(self, path, wav):
|
||||
"""save audio with scipy.io.wavfile in 16bit integers"""
|
||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||
scipy.io.wavfile.write(path, self.sample_rate,
|
||||
wav_norm.as_type(np.int16))
|
||||
|
||||
def load_wav(self, path, sr=None):
|
||||
"""load wav -> trim_silence -> rescale"""
|
||||
|
||||
x, sr = librosa.load(path, sr=None)
|
||||
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
|
||||
sr, self.sample_rate)
|
||||
if self.do_trim_silence:
|
||||
try:
|
||||
x = self.trim_silence(x)
|
||||
except ValueError:
|
||||
print(" [!] File cannot be trimmed for silence - {}".format(
|
||||
path))
|
||||
if self.sound_norm:
|
||||
x = x / x.max() * 0.9 # why 0.9 ?
|
||||
return x
|
||||
|
||||
def trim_silence(self, wav):
|
||||
"""Trim soilent parts with a threshold and 0.01s margin"""
|
||||
margin = int(self.sample_rate * 0.01)
|
||||
wav = wav[margin:-margin]
|
||||
trimed_wav = librosa.effects.trim(
|
||||
wav,
|
||||
top_db=60,
|
||||
frame_length=self.win_length,
|
||||
hop_length=self.hop_length)[0]
|
||||
return trimed_wav
|
||||
|
||||
def apply_preemphasis(self, x):
|
||||
if self.preemphasis == 0.:
|
||||
raise RuntimeError(
|
||||
" !! Preemphasis coefficient should be positive. ")
|
||||
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
|
||||
|
||||
def apply_inv_preemphasis(self, x):
|
||||
if self.preemphasis == 0.:
|
||||
raise RuntimeError(
|
||||
" !! Preemphasis coefficient should be positive. ")
|
||||
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
|
||||
|
||||
def _amplitude_to_db(self, x):
|
||||
amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
|
||||
return 20 * np.log10(np.maximum(amplitude_min, x))
|
||||
|
||||
@staticmethod
|
||||
def _db_to_amplitude(x):
|
||||
return np.power(10., 0.05 * x)
|
||||
|
||||
def _linear_to_mel(self, spectrogram):
|
||||
_mel_basis = self._build_mel_basis()
|
||||
return np.dot(_mel_basis, spectrogram)
|
||||
|
||||
def _mel_to_linear(self, mel_spectrogram):
|
||||
inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
|
||||
return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram))
|
||||
|
||||
def _build_mel_basis(self):
|
||||
"""return mel basis for mel scale"""
|
||||
if self.mel_fmax is not None:
|
||||
assert self.mel_fmax <= self.sample_rate // 2
|
||||
return librosa.filters.mel(self.sample_rate,
|
||||
def _create_mel_filter(self):
|
||||
mel_filter = librosa.filters.mel(
|
||||
self.sample_rate,
|
||||
self.n_fft,
|
||||
n_mels=self.num_mels,
|
||||
fmin=self.mel_fmin,
|
||||
fmax=self.mel_fmax)
|
||||
n_mels=self.n_mels,
|
||||
fmin=self.f_min,
|
||||
fmax=self.f_max)
|
||||
return mel_filter
|
||||
|
||||
def _normalize(self, S):
|
||||
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
|
||||
if self.signal_norm:
|
||||
S_norm = (S - self.min_level_db) / (-self.min_level_db)
|
||||
if self.symmetric_norm:
|
||||
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
|
||||
if self.clip_norm:
|
||||
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
|
||||
return S_norm
|
||||
else:
|
||||
S_norm = self.max_norm * S_norm
|
||||
if self.clip_norm:
|
||||
S_norm = np.clip(S_norm, 0, self.max_norm)
|
||||
return S_norm
|
||||
else:
|
||||
return S
|
||||
def read_wav(self, filename):
|
||||
# resampling may occur
|
||||
wav, _ = librosa.load(filename, sr=self.sample_rate)
|
||||
return wav
|
||||
|
||||
def _denormalize(self, S):
|
||||
"""denormalize values"""
|
||||
S_denorm = S
|
||||
if self.signal_norm:
|
||||
if self.symmetric_norm:
|
||||
if self.clip_norm:
|
||||
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
|
||||
S_denorm = (S_denorm + self.max_norm) * (
|
||||
-self.min_level_db) / (2 * self.max_norm
|
||||
) + self.min_level_db
|
||||
return S_denorm
|
||||
else:
|
||||
if self.clip_norm:
|
||||
S_denorm = np.clip(S_denorm, 0, self.max_norm)
|
||||
S_denorm = S_denorm * (-self.min_level_db
|
||||
) / self.max_norm + self.min_level_db
|
||||
return S_denorm
|
||||
else:
|
||||
return S
|
||||
def write_wav(self, path, wav):
|
||||
sf.write(path, wav, samplerate=self.sample_rate)
|
||||
|
||||
def _stft(self, y):
|
||||
return librosa.stft(
|
||||
y=y,
|
||||
def stft(self, wav):
|
||||
D = librosa.core.stft(
|
||||
wav,
|
||||
n_fft = self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
hop_length=self.hop_length)
|
||||
window=self.window,
|
||||
center=self.center,
|
||||
pad_mode=self.pad_mode)
|
||||
return D
|
||||
|
||||
def _istft(self, S):
|
||||
return librosa.istft(
|
||||
S, hop_length=self.hop_length, win_length=self.win_length)
|
||||
def istft(self, D):
|
||||
wav = librosa.core.istft(
|
||||
D,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
window=self.window,
|
||||
center=self.center)
|
||||
return wav
|
||||
|
||||
def spectrogram(self, y):
|
||||
"""compute linear spectrogram(amplitude)
|
||||
preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize
|
||||
"""
|
||||
if self.preemphasis:
|
||||
D = self._stft(self.apply_preemphasis(y))
|
||||
else:
|
||||
D = self._stft(y)
|
||||
S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db
|
||||
return self._normalize(S)
|
||||
def spectrogram(self, wav):
|
||||
D = self.stft(wav)
|
||||
return np.abs(D)
|
||||
|
||||
def melspectrogram(self, y):
|
||||
"""compute linear spectrogram(amplitude)
|
||||
preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize
|
||||
"""
|
||||
if self.preemphasis:
|
||||
D = self._stft(self.apply_preemphasis(y))
|
||||
else:
|
||||
D = self._stft(y)
|
||||
S = self._amplitude_to_db(self._linear_to_mel(np.abs(
|
||||
D))) - self.ref_level_db
|
||||
return self._normalize(S)
|
||||
|
||||
def inv_spectrogram(self, spectrogram):
|
||||
"""convert spectrogram back to waveform using griffin_lim in librosa"""
|
||||
S = self._denormalize(spectrogram)
|
||||
S = self._db_to_amplitude(S + self.ref_level_db)
|
||||
if self.preemphasis:
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
|
||||
return self._griffin_lim(S**self.power)
|
||||
|
||||
def inv_melspectrogram(self, mel_spectrogram):
|
||||
S = self._denormalize(mel_spectrogram)
|
||||
S = self._db_to_amplitude(S + self.ref_level_db)
|
||||
S = self._mel_to_linear(np.abs(S))
|
||||
if self.preemphasis:
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
|
||||
return self._griffin_lim(S**self.power)
|
||||
|
||||
def out_linear_to_mel(self, linear_spec):
|
||||
"""convert output linear spec to mel spec"""
|
||||
S = self._denormalize(linear_spec)
|
||||
S = self._db_to_amplitude(S + self.ref_level_db)
|
||||
S = self._linear_to_mel(np.abs(S))
|
||||
S = self._amplitude_to_db(S) - self.ref_level_db
|
||||
mel = self._normalize(S)
|
||||
def mel_spectrogram(self, wav):
|
||||
S = self.spectrogram(wav)
|
||||
mel = np.dot(self.mel_filter, S)
|
||||
return mel
|
||||
|
||||
def _griffin_lim(self, S):
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
y = self._istft(S_complex * angles)
|
||||
for _ in range(self.griffin_lim_iters):
|
||||
angles = np.exp(1j * np.angle(self._stft(y)))
|
||||
y = self._istft(S_complex * angles)
|
||||
return y
|
||||
|
||||
@staticmethod
|
||||
def mulaw_encode(wav, qc):
|
||||
mu = 2**qc - 1
|
||||
# wav_abs = np.minimum(np.abs(wav), 1.0)
|
||||
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
|
||||
# Quantize signal to the specified number of levels.
|
||||
signal = (signal + 1) / 2 * mu + 0.5
|
||||
return np.floor(signal, )
|
||||
|
||||
@staticmethod
|
||||
def mulaw_decode(wav, qc):
|
||||
"""Recovers waveform from quantized values."""
|
||||
mu = 2**qc - 1
|
||||
x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
|
||||
return x
|
||||
|
||||
@staticmethod
|
||||
def encode_16bits(x):
|
||||
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
|
||||
|
||||
@staticmethod
|
||||
def quantize(x, bits):
|
||||
return (x + 1.) * (2**bits - 1) / 2
|
||||
|
||||
@staticmethod
|
||||
def dequantize(x, bits):
|
||||
return 2 * x / (2**bits - 1) - 1
|
||||
|
|
|
@ -7,6 +7,9 @@ the generated spectrogram so as to be used with vocoders like griffin lim.
|
|||
|
||||
The base class describe the interface. `transform` is used to perform
|
||||
transformation and `inverse` is used to perform the inverse transformation.
|
||||
|
||||
check issues:
|
||||
https://github.com/mozilla/TTS/issues/377
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
@ -18,6 +21,9 @@ class NormalizerBase(object):
|
|||
raise NotImplementedError("inverse must be implemented")
|
||||
|
||||
class LogMagnitude(NormalizerBase):
|
||||
"""
|
||||
This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
|
||||
"""
|
||||
def __init__(self, min=1e-7):
|
||||
self.min = min
|
||||
|
||||
|
@ -29,6 +35,10 @@ class LogMagnitude(NormalizerBase):
|
|||
def inverse(self, x):
|
||||
return np.exp(x)
|
||||
|
||||
|
||||
class UnitMagnitude(NormalizerBase):
    # dbscale and (0, 1) normalization
    """
    Placeholder for a normalizer that combines dB scaling with (0, 1)
    normalization.

    Nothing is implemented here yet: the class body is only ``pass``, so
    every method is inherited unchanged from ``NormalizerBase``.
    """
    pass
|
|
@ -1,7 +1,7 @@
|
|||
from paddle.io import Dataset
|
||||
|
||||
from os import listdir
|
||||
from os.path import splitext, join
|
||||
from pathlib import Path
|
||||
import librosa
|
||||
|
||||
class AudioFolderDataset(Dataset):
|
||||
|
@ -20,3 +20,25 @@ class AudioFolderDataset(Dataset):
|
|||
file_name = self.file_names[i]
|
||||
y, sr = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable
|
||||
return y
|
||||
|
||||
|
||||
class LJSpeechMetaData(Dataset):
    """Metadata-only dataset built from ``<root>/metadata.csv``.

    Every non-empty line of the csv file must contain exactly three
    ``|``-separated fields; the first is a file stem and the third is the
    normalized text (the middle field is ignored). No audio is loaded here.

    Each record is a list ``[wav_path, normalized_text, speaker_name]``
    where ``wav_path`` is ``<root>/wavs/<stem>.wav`` as a string and
    ``speaker_name`` is always ``"ljspeech"``.
    """

    def __init__(self, root):
        """Parse the metadata file found under ``root``.

        Args:
            root: dataset directory (``~`` is expanded); it must contain
                ``metadata.csv``.

        Raises:
            OSError: if ``metadata.csv`` cannot be opened.
            ValueError: if a non-empty line does not have exactly three
                ``|``-separated fields.
        """
        self.root = Path(root).expanduser()
        wav_dir = self.root / "wavs"
        csv_path = self.root / "metadata.csv"
        speaker_name = "ljspeech"

        records = []
        # Read as UTF-8 explicitly so that parsing does not depend on the
        # platform's default text encoding.
        with open(str(csv_path), 'rt', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline block)
                    # instead of failing on tuple unpacking.
                    continue
                filename, _, normalized_text = line.split("|")
                filename = str(wav_dir / (filename + ".wav"))
                records.append([filename, normalized_text, speaker_name])
        self.records = records

    def __getitem__(self, i):
        """Return the ``i``-th ``[wav_path, normalized_text, speaker_name]`` record."""
        return self.records[i]

    def __len__(self):
        """Return the number of parsed records."""
        return len(self.records)
|
||||
|
||||
|
|
|
@ -27,7 +27,12 @@ class English(Phonetics):
|
|||
self.vocab = Vocab(self.phonemes + self.punctuations)
|
||||
|
||||
def phoneticize(self, sentence):
|
||||
return self.backend(sentence)
|
||||
start = self.vocab.start_symbol
|
||||
end = self.vocab.end_symbol
|
||||
phonemes = ([] if start is None else [start]) \
|
||||
+ self.backend(sentence) \
|
||||
+ ([] if end is None else [end])
|
||||
return phonemes
|
||||
|
||||
def numericalize(self, phonemes):
|
||||
ids = [self.vocab.lookup(item) for item in phonemes if item in self.vocab.stoi]
|
||||
|
@ -58,6 +63,11 @@ class Chinese(Phonetics):
|
|||
def phoneticize(self, sentence):
|
||||
simplified = self.opencc_backend.convert(sentence)
|
||||
phonemes = self.backend(simplified)
|
||||
start = self.vocab.start_symbol
|
||||
end = self.vocab.end_symbol
|
||||
phonemes = ([] if start is None else [start]) \
|
||||
+ phonemes \
|
||||
+ ([] if end is None else [end])
|
||||
return self._filter_symbols(phonemes)
|
||||
|
||||
def _filter_symbols(self, phonemes):
|
||||
|
|
|
@ -22,11 +22,10 @@ class Vocab(object):
|
|||
|
||||
self.stoi = OrderedDict()
|
||||
self.stoi.update(self.special_symbols)
|
||||
N = len(self.special_symbols)
|
||||
|
||||
for i, s in enumerate(symbols):
|
||||
if s not in self.stoi:
|
||||
self.stoi[s] = N +i
|
||||
self.stoi[s] = len(self.stoi)
|
||||
self.itos = {v: k for k, v in self.stoi.items()}
|
||||
|
||||
def __len__(self):
|
||||
|
|
|
@ -21,7 +21,7 @@ class ConvBlock(nn.Layer):
|
|||
|
||||
std = math.sqrt(4 * keep_prob / (kernel_size * in_channel))
|
||||
padding = "valid" if causal else "same"
|
||||
conv = nn.Conv1d(in_channel, 2 * in_channel, (kernel_size, ),
|
||||
conv = nn.Conv1D(in_channel, 2 * in_channel, (kernel_size, ),
|
||||
padding=padding,
|
||||
data_format="NLC",
|
||||
weight_attr=I.Normal(scale=std))
|
||||
|
|
|
@ -2,11 +2,12 @@ import math
|
|||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
from paddle.nn import initializer as I
|
||||
|
||||
from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention
|
||||
from parakeet.modules.transformer import PositionwiseFFN
|
||||
from parakeet.modules import masking
|
||||
from parakeet.modules.cbhg import Conv1dBatchNorm
|
||||
from parakeet.modules.conv import Conv1dBatchNorm
|
||||
from parakeet.modules import positional_encoding as pe
|
||||
|
||||
__all__ = ["TransformerTTS"]
|
||||
|
@ -21,7 +22,7 @@ class MultiheadAttention(nn.Layer):
|
|||
Another deviation is that it concats the input query and context vector before
|
||||
applying the output projection.
|
||||
"""
|
||||
def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None):
|
||||
def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None, k_input_dim=None, v_input_dim=None):
|
||||
"""
|
||||
Args:
|
||||
model_dim (int): the feature size of query.
|
||||
|
@ -42,9 +43,11 @@ class MultiheadAttention(nn.Layer):
|
|||
depth = model_dim // num_heads
|
||||
k_dim = k_dim or depth
|
||||
v_dim = v_dim or depth
|
||||
k_input_dim = k_input_dim or model_dim
|
||||
v_input_dim = v_input_dim or model_dim
|
||||
self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
|
||||
self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
|
||||
self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
|
||||
self.affine_k = nn.Linear(k_input_dim, num_heads * k_dim)
|
||||
self.affine_v = nn.Linear(v_input_dim, num_heads * v_dim)
|
||||
self.affine_o = nn.Linear(model_dim + num_heads * v_dim, model_dim)
|
||||
|
||||
self.num_heads = num_heads
|
||||
|
@ -128,7 +131,7 @@ class TransformerDecoderLayer(nn.Layer):
|
|||
"""
|
||||
Transformer decoder layer.
|
||||
"""
|
||||
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
|
||||
def __init__(self, d_model, n_heads, d_ffn, dropout=0., d_encoder=None):
|
||||
"""
|
||||
Args:
|
||||
d_model (int): the feature size of the input, and the output.
|
||||
|
@ -141,7 +144,7 @@ class TransformerDecoderLayer(nn.Layer):
|
|||
self.self_mha = MultiheadAttention(d_model, n_heads)
|
||||
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
|
||||
|
||||
self.cross_mha = MultiheadAttention(d_model, n_heads)
|
||||
self.cross_mha = MultiheadAttention(d_model, n_heads, k_input_dim=d_encoder, v_input_dim=d_encoder)
|
||||
self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
|
||||
|
||||
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
|
||||
|
@ -194,10 +197,10 @@ class TransformerEncoder(nn.LayerList):
|
|||
|
||||
|
||||
class TransformerDecoder(nn.LayerList):
|
||||
def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0.):
|
||||
def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0., d_encoder=None):
|
||||
super(TransformerDecoder, self).__init__()
|
||||
for _ in range(n_layers):
|
||||
self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout))
|
||||
self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout, d_encoder=d_encoder))
|
||||
|
||||
def forward(self, q, k, v, encoder_mask, decoder_mask):
|
||||
self_attention_weights = []
|
||||
|
@ -233,7 +236,7 @@ class CNNPostNet(nn.Layer):
|
|||
c_out = d_output if i == n_layers - 1 else d_hidden
|
||||
self.convs.append(
|
||||
Conv1dBatchNorm(c_in, c_out, kernel_size, padding=padding))
|
||||
self.last_norm = nn.BatchNorm1d(d_output)
|
||||
self.last_norm = nn.BatchNorm1D(d_output)
|
||||
|
||||
def forward(self, x):
|
||||
x_in = x
|
||||
|
@ -244,44 +247,51 @@ class CNNPostNet(nn.Layer):
|
|||
|
||||
|
||||
class TransformerTTS(nn.Layer):
|
||||
def __init__(self, vocab_size, padding_idx, d_model, d_mel, n_heads, d_ffn, positional_encoding_scalar,
|
||||
def __init__(self, vocab_size, padding_idx, d_encoder, d_decoder, d_mel, n_heads, d_ffn,
|
||||
encoder_layers, decoder_layers, d_prenet, d_postnet, postnet_layers,
|
||||
postnet_kernel_size, max_reduction_factor, dropout):
|
||||
super(TransformerTTS, self).__init__()
|
||||
self.encoder_prenet = nn.Embedding(vocab_size, d_model, padding_idx)
|
||||
self.encoder_pe = pe.positional_encoding(0, 1000, d_model) # it may be extended later
|
||||
self.encoder = TransformerEncoder(d_model, n_heads, d_ffn, encoder_layers, dropout)
|
||||
# initial pe scalar is 1, though it is trainable
|
||||
self.pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
|
||||
|
||||
self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_model, dropout)
|
||||
self.decoder_pe = pe.positional_encoding(0, 1000, d_model) # it may be extended later
|
||||
self.decoder = TransformerDecoder(d_model, n_heads, d_ffn, decoder_layers, dropout)
|
||||
self.final_proj = nn.Linear(d_model, max_reduction_factor * d_mel)
|
||||
# encoder
|
||||
self.encoder_prenet = nn.Embedding(vocab_size, d_encoder, padding_idx)
|
||||
self.encoder_pe = pe.positional_encoding(0, 1000, d_encoder) # it may be extended later
|
||||
self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn, encoder_layers, dropout)
|
||||
|
||||
# decoder
|
||||
self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
|
||||
self.decoder_pe = pe.positional_encoding(0, 1000, d_decoder) # it may be extended later
|
||||
self.decoder = TransformerDecoder(d_decoder, n_heads, d_ffn, decoder_layers, dropout, d_encoder=d_encoder)
|
||||
self.final_proj = nn.Linear(d_decoder, max_reduction_factor * d_mel)
|
||||
self.decoder_postnet = CNNPostNet(d_mel, d_postnet, d_mel, postnet_kernel_size, postnet_layers)
|
||||
self.stop_conditioner = nn.Linear(d_mel, 3)
|
||||
|
||||
# specs
|
||||
self.padding_idx = padding_idx
|
||||
self.d_model = d_model
|
||||
self.pe_scalar = positional_encoding_scalar
|
||||
self.d_encoder = d_encoder
|
||||
self.d_decoder = d_decoder
|
||||
|
||||
# start and end
|
||||
# start and end: though it is only used in predict
|
||||
# it can also be used in training
|
||||
dtype = paddle.get_default_dtype()
|
||||
self.start_vec = paddle.fill_constant([1, d_mel], dtype=dtype, value=0)
|
||||
self.end_vec = paddle.fill_constant([1, d_mel], dtype=dtype, value=0)
|
||||
self.start_vec = paddle.full([1, d_mel], 0, dtype=dtype)
|
||||
self.end_vec = paddle.full([1, d_mel], 0, dtype=dtype)
|
||||
self.stop_prob_index = 2
|
||||
|
||||
self.max_r = max_reduction_factor
|
||||
self.r = max_reduction_factor # set it every call
|
||||
|
||||
|
||||
def forward(self, text, mel, stop):
|
||||
pass
|
||||
encoded, encoder_attention_weights, encoder_mask = self.encode(text)
|
||||
mel_output, mel_intermediate, cross_attention_weights, stop_logits = self.decode(encoded, mel, encoder_mask)
|
||||
return mel_output, mel_intermediate, encoder_attention_weights, cross_attention_weights
|
||||
|
||||
def encode(self, text):
|
||||
T_enc = text.shape[-1]
|
||||
embed = self.encoder_prenet(text)
|
||||
pe = self.encoder_pe[:T_enc, :] # (T, C)
|
||||
x = embed.scale(math.sqrt(self.d_model)) + pe.scale(self.pe_scalar)
|
||||
x = embed.scale(math.sqrt(self.d_encoder)) + pe * self.pe_scalar
|
||||
encoder_padding_mask = masking.id_mask(text, self.padding_idx, dtype=x.dtype)
|
||||
|
||||
x = F.dropout(x, training=self.training)
|
||||
|
@ -343,6 +353,11 @@ class TransformerTTS(nn.Layer):
|
|||
return decoder_output[:, 1:, :], encoder_attentions, cross_attention_weights
|
||||
|
||||
|
||||
class TransformerTTSLoss(nn.Layer):
    """Loss layer for TransformerTTS; the computation is not implemented yet.

    Only the constructor is functional. ``forward`` is a stub.
    """

    def __init__(self, stop_loss_scale):
        """Store the loss configuration.

        Args:
            stop_loss_scale: kept on the instance as ``self.stop_loss_scale``.
                Presumably the weight of the stop-prediction loss term --
                TODO confirm once ``forward`` is implemented.
        """
        super(TransformerTTSLoss, self).__init__()
        self.stop_loss_scale = stop_loss_scale

    def forward(self, ):
        """Compute the loss (not implemented).

        The previous body returned the undefined names ``loss`` and
        ``details``, which failed with ``NameError`` on every call. Fail
        explicitly instead so the unfinished state is obvious to callers.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "TransformerTTSLoss.forward is not implemented yet")
|
|
@ -40,7 +40,7 @@ class CBHG(nn.Layer):
|
|||
proj_out_channels = projection_channels + \
|
||||
[in_channels] # ensure residual connection
|
||||
for c_in, c_out in zip(proj_in_channels, proj_out_channels):
|
||||
conv = nn.Conv1d(c_in, c_out, (3,), padding=(1, 1))
|
||||
conv = nn.Conv1D(c_in, c_out, (3,), padding=(1, 1))
|
||||
self.projections.append(conv)
|
||||
|
||||
if in_channels != highway_features:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
class Conv1dCell(nn.Conv1d):
|
||||
class Conv1dCell(nn.Conv1D):
|
||||
"""
|
||||
A subclass of Conv1d layer, which can be used like an RNN cell. It can take
|
||||
step input and return step output. It is done by keeping an internal buffer,
|
||||
|
@ -86,12 +86,12 @@ class Conv1dBatchNorm(nn.Layer):
|
|||
weight_attr=None, bias_attr=None):
|
||||
super(Conv1dBatchNorm, self).__init__()
|
||||
# TODO(chenfeiyu): carefully initialize Conv1d's weight
|
||||
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride,
|
||||
self.conv = nn.Conv1D(in_channels, out_channels, kernel_size, stride,
|
||||
padding=padding,
|
||||
weight_attr=weight_attr,
|
||||
bias_attr=bias_attr)
|
||||
# TODO: channel last, but BatchNorm1d does not support channel last layout
|
||||
self.bn = nn.BatchNorm1d(out_channels)
|
||||
self.bn = nn.BatchNorm1D(out_channels)
|
||||
|
||||
def forward(self, x):
|
||||
return self.bn(self.conv(x))
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
|
||||
def weighted_mean(input, weight):
    """Mean of ``input`` weighted by ``weight`` (usable as a masked mean).

    Args:
        input (Tensor): input tensor, floating point dtype.
        weight (Tensor): weight tensor with broadcastable shape.

    Returns:
        Tensor: shape(1,), weighted mean tensor with the same dtype as input.
    """
    weights = paddle.cast(weight, input.dtype)
    # Each weight is reused this many times when it is broadcast against
    # ``input``, so the plain sum of weights has to be scaled up by it.
    repeats = input.numel() / weights.numel()
    weighted_total = paddle.sum(input * weights)
    effective_weight = paddle.sum(weights) * repeats
    return weighted_total / effective_weight
|
||||
|
||||
def masked_l1_loss(prediction, target, mask):
    """Masked mean of the element-wise L1 error.

    The unreduced ``F.l1_loss`` between ``prediction`` and ``target`` is
    averaged with ``weighted_mean`` using ``mask`` as the weight.
    """
    elementwise_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(elementwise_error, mask)
    return loss
|
||||
|
||||
def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
    """Masked mean of softmax cross entropy.

    ``F.softmax_with_cross_entropy`` is evaluated along ``axis`` and the
    result is averaged with ``weighted_mean`` using ``mask`` as the weight.
    """
    cross_entropy = F.softmax_with_cross_entropy(logits, label, axis=axis)
    loss = weighted_mean(cross_entropy, mask)
    return loss
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
import unittest
|
||||
import paddle
|
||||
paddle.set_device("cpu")
|
||||
import numpy as np
|
||||
|
||||
from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy
|
||||
|
||||
class TestWeightedMean(unittest.TestCase):
    """Check ``weighted_mean`` with a boolean mask that broadcasts over columns."""

    def test(self):
        # Rows 0..9, each value repeated across 3 columns.
        rows = paddle.arange(0, 10, dtype="float64")
        x = rows.unsqueeze(-1).broadcast_to([10, 3])
        # Keep only rows 5..9; their mean is 7.
        keep = paddle.arange(0, 10, dtype="float64") > 4
        mask = keep.unsqueeze(-1)
        result = weighted_mean(x, mask)
        self.assertAlmostEqual(result.numpy()[0], 7)
|
||||
|
||||
|
||||
class TestMaskedL1Loss(unittest.TestCase):
    """Check ``masked_l1_loss`` against an all-zero target."""

    def test(self):
        # Rows 0..9, each value repeated across 3 columns.
        rows = paddle.arange(0, 10, dtype="float64")
        x = rows.unsqueeze(-1).broadcast_to([10, 3])
        # With a zero target the absolute error equals x itself.
        y = paddle.zeros_like(x)
        # Keep only rows 5..9; their mean absolute error is 7.
        keep = paddle.arange(0, 10, dtype="float64") > 4
        mask = keep.unsqueeze(-1)
        loss = masked_l1_loss(x, y, mask)
        print(loss)
        self.assertAlmostEqual(loss.numpy()[0], 7)
|
||||
|
||||
|
||||
class TestMaskedCrossEntropy(unittest.TestCase):
    """Smoke test for ``masked_softmax_with_cross_entropy`` on random input."""

    def test(self):
        x = paddle.randn([3, 30, 8], dtype="float64")
        # Labels need a trailing singleton axis to line up with the logits.
        y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1)
        mask = paddle.fluid.layers.sequence_mask(
            paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
        loss = masked_softmax_with_cross_entropy(x, y, mask)

        # The inputs are random, so no exact value can be pinned. Cross
        # entropy is non-negative and the mask weights are 0/1, so the
        # masked mean must be a finite, non-negative number.
        value = loss.numpy()
        self.assertTrue(np.isfinite(value).all())
        self.assertTrue((value >= 0).all())
|
|
@ -63,8 +63,7 @@ class TestTransformerDecoderLayer(unittest.TestCase):
|
|||
class TestTransformerTTS(unittest.TestCase):
|
||||
def setUp(self):
|
||||
net = tts.TransformerTTS(
|
||||
128, 0, 64, 80, 4, 128,
|
||||
0.5,
|
||||
128, 0, 64, 128, 80, 4, 128,
|
||||
6, 6, 128, 128, 4,
|
||||
3, 10, 0.5)
|
||||
self.net = net
|
||||
|
|
Loading…
Reference in New Issue