1. API renaming: Conv1d -> Conv1D, BatchNorm1d -> BatchNorm1D (see the sketch after this list);
2. add losses in parakeet/modules;
3. fix a bug in phonetics;
4. TransformerTTS update: encoder dim can be different from decoder dim;
5. MultiHeadAttention in TransformerTTS: add k_input_dim & v_input_dim in __init__ to allow different feature sizes for k and v.
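A minimal before/after sketch of the renamed layers in item 1, using only public paddle.nn APIs; the channel sizes and shapes are illustrative:

import paddle
from paddle import nn

# old spelling: nn.Conv1d / nn.BatchNorm1d; new spelling after this commit:
conv = nn.Conv1D(in_channels=80, out_channels=256, kernel_size=5, padding=2)
bn = nn.BatchNorm1D(256)
x = paddle.randn([4, 80, 100])  # (batch, channels, time)
y = bn(conv(x))                 # -> shape [4, 256, 100]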
chenfeiyu 2020-10-22 05:04:45 +00:00
parent 2a764d9a10
commit c43216ae9b
14 changed files with 230 additions and 305 deletions

.gitignore

@ -4,6 +4,9 @@
*.udb
*.ann
# data
datasets/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
from .spec_normalizer import NormalizerBase, LogMagnitude

@ -15,278 +15,80 @@
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
class AudioProcessor(object):
def __init__(
self,
sample_rate=None, # int, sampling rate
num_mels=None, # int, bands of mel spectrogram
min_level_db=None, # float, minimum level db
ref_level_db=None, # float, reference level db
n_fft=None, # int: number of samples in a frame for stft
win_length=None, # int: the same meaning with n_fft
hop_length=None, # int: number of samples between neighboring frame
power=None, # float:power to raise before griffin-lim
preemphasis=None, # float: preemphasis coefficident
signal_norm=None, #
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
max_norm=None, # float, max norm
mel_fmin=None, # int: mel spectrogram's minimum frequency
mel_fmax=None, # int: mel spectrogram's maximum frequency
clip_norm=True, # bool: clip spectrogram's norm
griffin_lim_iters=None, # int:
do_trim_silence=False, # bool: trim silence
sound_norm=False,
**kwargs):
def __init__(self,
sample_rate:int,
n_fft:int,
win_length:int,
hop_length:int,
n_mels:int=80,
f_min:int=0,
f_max:int=None,
window="hann",
center="True",
pad_mode="reflect"):
# read & write
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
self.ref_level_db = ref_level_db
# stft related
# stft
self.n_fft = n_fft
self.win_length = win_length or n_fft
# hop length defaults to 1/4 window_length
self.hop_length = hop_length or 0.25 * self.win_length
self.win_length = win_length
self.hop_length = hop_length
self.window = window
self.center = center
self.pad_mode = pad_mode
# mel
self.n_mels = n_mels
self.f_min = f_min
self.f_max = f_max
self.power = power
self.preemphasis = float(preemphasis)
self.mel_filter = self._create_mel_filter()
self.inv_mel_filter = np.linalg.pinv(self.mel_filter)
def _create_mel_filter(self):
mel_filter = librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.n_mels,
fmin=self.f_min,
fmax=self.f_max)
return mel_filter
self.griffin_lim_iters = griffin_lim_iters
self.signal_norm = signal_norm
self.symmetric_norm = symmetric_norm
def read_wav(self, filename):
# resampling may occur
wav, _ = librosa.load(filename, sr=self.sample_rate)
return wav
# mel transform related
self.mel_fmin = mel_fmin
self.mel_fmax = mel_fmax
def write_wav(self, path, wav):
sf.write(path, wav, samplerate=self.sample_rate)
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
)
def _stft_parameters(self):
"""compute frame length and hop length in ms"""
frame_length_ms = self.win_length * 1. / self.sample_rate
frame_shift_ms = self.hop_length * 1. / self.sample_rate
num_freq = 1 + self.n_fft // 2
return num_freq, frame_length_ms, frame_shift_ms
def __repr__(self):
"""object repr"""
cls_name_str = self.__class__.__name__
members = vars(self)
dict_str = "\n".join(
[" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str
def save_wav(self, path, wav):
"""save audio with scipy.io.wavfile in 16bit integers"""
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate,
wav_norm.as_type(np.int16))
def load_wav(self, path, sr=None):
"""load wav -> trim_silence -> rescale"""
x, sr = librosa.load(path, sr=None)
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
sr, self.sample_rate)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
print(" [!] File cannot be trimmed for silence - {}".format(
path))
if self.sound_norm:
x = x / x.max() * 0.9 # why 0.9 ?
return x
def trim_silence(self, wav):
"""Trim soilent parts with a threshold and 0.01s margin"""
margin = int(self.sample_rate * 0.01)
wav = wav[margin:-margin]
trimed_wav = librosa.effects.trim(
def stft(self, wav):
D = librosa.core.stft(
wav,
top_db=60,
frame_length=self.win_length,
hop_length=self.hop_length)[0]
return trimed_wav
def apply_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(
" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(
" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
def _amplitude_to_db(self, x):
amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(amplitude_min, x))
@staticmethod
def _db_to_amplitude(x):
return np.power(10., 0.05 * x)
def _linear_to_mel(self, spectrogram):
_mel_basis = self._build_mel_basis()
return np.dot(_mel_basis, spectrogram)
def _mel_to_linear(self, mel_spectrogram):
inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram))
def _build_mel_basis(self):
"""return mel basis for mel scale"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
if self.signal_norm:
S_norm = (S - self.min_level_db) / (-self.min_level_db)
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
return S_norm
else:
S_norm = self.max_norm * S_norm
if self.clip_norm:
S_norm = np.clip(S_norm, 0, self.max_norm)
return S_norm
else:
return S
def _denormalize(self, S):
"""denormalize values"""
S_denorm = S
if self.signal_norm:
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = (S_denorm + self.max_norm) * (
-self.min_level_db) / (2 * self.max_norm
) + self.min_level_db
return S_denorm
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = S_denorm * (-self.min_level_db
) / self.max_norm + self.min_level_db
return S_denorm
else:
return S
def _stft(self, y):
return librosa.stft(
y=y,
n_fft=self.n_fft,
n_fft = self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
hop_length=self.hop_length)
window=self.window,
center=self.center,
pad_mode=self.pad_mode)
return D
def _istft(self, S):
return librosa.istft(
S, hop_length=self.hop_length, win_length=self.win_length)
def istft(self, D):
wav = librosa.core.istft(
D,
hop_length=self.hop_length,
win_length=self.win_length,
window=self.window,
center=self.center)
return wav
def spectrogram(self, y):
"""compute linear spectrogram(amplitude)
preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize
"""
if self.preemphasis:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db
return self._normalize(S)
def spectrogram(self, wav):
D = self.stft(wav)
return np.abs(D)
def melspectrogram(self, y):
"""compute linear spectrogram(amplitude)
preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize
"""
if self.preemphasis:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amplitude_to_db(self._linear_to_mel(np.abs(
D))) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
"""convert spectrogram back to waveform using griffin_lim in librosa"""
S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def inv_melspectrogram(self, mel_spectrogram):
S = self._denormalize(mel_spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
S = self._mel_to_linear(np.abs(S))
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
"""convert output linear spec to mel spec"""
S = self._denormalize(linear_spec)
S = self._db_to_amplitude(S + self.ref_level_db)
S = self._linear_to_mel(np.abs(S))
S = self._amplitude_to_db(S) - self.ref_level_db
mel = self._normalize(S)
def mel_spectrogram(self, wav):
S = self.spectrogram(wav)
mel = np.dot(self.mel_filter, S)
return mel
def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = self._istft(S_complex * angles)
for _ in range(self.griffin_lim_iters):
angles = np.exp(1j * np.angle(self._stft(y)))
y = self._istft(S_complex * angles)
return y
@staticmethod
def mulaw_encode(wav, qc):
mu = 2**qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
# Quantize signal to the specified number of levels.
signal = (signal + 1) / 2 * mu + 0.5
return np.floor(signal, )
@staticmethod
def mulaw_decode(wav, qc):
"""Recovers waveform from quantized values."""
mu = 2**qc - 1
x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
return x
@staticmethod
def encode_16bits(x):
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
@staticmethod
def quantize(x, bits):
return (x + 1.) * (2**bits - 1) / 2
@staticmethod
def dequantize(x, bits):
return 2 * x / (2**bits - 1) - 1
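A short usage sketch of the slimmed-down AudioProcessor above; the parakeet.audio import path and the 22050 Hz settings are assumptions, and "sample.wav" is a placeholder:

from parakeet.audio import AudioProcessor  # assumed export path, per the package __init__ hunk above

processor = AudioProcessor(
    sample_rate=22050, n_fft=1024, win_length=1024, hop_length=256, n_mels=80)
wav = processor.read_wav("sample.wav")  # placeholder path; resampled to sample_rate if needed
mag = processor.spectrogram(wav)        # |STFT|, shape (1 + n_fft // 2, n_frames)
mel = processor.mel_spectrogram(wav)    # mel filterbank applied to |STFT|, shape (n_mels, n_frames)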

@ -7,6 +7,9 @@ the generated spectrogram so as to be used with vocoders like griffin lim.
The base class describe the interface. `transform` is used to perform
transformation and `inverse` is used to perform the inverse transformation.
check issues:
https://github.com/mozilla/TTS/issues/377
"""
import numpy as np
@ -18,6 +21,9 @@ class NormalizerBase(object):
raise NotImplementedError("inverse must be implemented")
class LogMagnitude(NormalizerBase):
"""
This is a simple normalizer used in Waveglow, Waveflow, tacotron2...
"""
def __init__(self, min=1e-7):
self.min = min
@ -28,7 +34,11 @@ class LogMagnitude(NormalizerBase):
def inverse(self, x):
return np.exp(x)
class UnitMagnitude(NormalizerBase):
# dbscale and (0, 1) normalization
"""
This is the normalizer used in the
"""
pass
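A small usage sketch of the normalizer interface described in this file's docstring; only inverse is shown in the hunk, so the log-compression in transform is an assumption:

import numpy as np
from parakeet.audio import LogMagnitude  # assumed import path, matching the package __init__ hunk above

norm = LogMagnitude(min=1e-7)
mel = np.random.rand(80, 100).astype("float32")
log_mel = norm.transform(mel)     # assumed: log compression, e.g. np.log(np.maximum(mel, 1e-7))
restored = norm.inverse(log_mel)  # np.exp, as shown above; recovers mel up to the 1e-7 floor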

@ -1,7 +1,7 @@
from paddle.io import Dataset
from os import listdir
from os.path import splitext, join
from pathlib import Path
import librosa
class AudioFolderDataset(Dataset):
@ -19,4 +19,26 @@ class AudioFolderDataset(Dataset):
def __getitem__(self, i):
file_name = self.file_names[i]
y, sr = librosa.load(file_name, sr=self.sample_rate) # pylint: disable=unused-variable
return y
class LJSpeechMetaData(Dataset):
def __init__(self, root):
self.root = Path(root).expanduser()
wav_dir = self.root / "wavs"
csv_path = self.root / "metadata.csv"
records = []
speaker_name = "ljspeech"
with open(str(csv_path), 'rt') as f:
for line in f:
filename, _, normalized_text = line.strip().split("|")
filename = str(wav_dir / (filename + ".wav"))
records.append([filename, normalized_text, speaker_name])
self.records = records
def __getitem__(self, i):
return self.records[i]
def __len__(self):
return len(self.records)
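A quick sketch of iterating the new LJSpeechMetaData records; the module path and the dataset root are placeholders/assumptions:

from parakeet.datasets import LJSpeechMetaData  # assumed module path

meta = LJSpeechMetaData("~/datasets/LJSpeech-1.1")  # placeholder dataset root
print(len(meta))                                    # number of utterances listed in metadata.csv
wav_path, normalized_text, speaker = meta[0]        # each record: [wav path, normalized text, "ljspeech"]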

@ -27,7 +27,12 @@ class English(Phonetics):
self.vocab = Vocab(self.phonemes + self.punctuations)
def phoneticize(self, sentence):
return self.backend(sentence)
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ self.backend(sentence) \
+ ([] if end is None else [end])
return phonemes
def numericalize(self, phonemes):
ids = [self.vocab.lookup(item) for item in phonemes if item in self.vocab.stoi]
@ -58,6 +63,11 @@ class Chinese(Phonetics):
def phoneticize(self, sentence):
simplified = self.opencc_backend.convert(sentence)
phonemes = self.backend(simplified)
start = self.vocab.start_symbol
end = self.vocab.end_symbol
phonemes = ([] if start is None else [start]) \
+ phonemes \
+ ([] if end is None else [end])
return self._filter_symbols(phonemes)
def _filter_symbols(self, phonemes):
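A small sketch of the fixed phoneticize behavior; the import path and the no-argument construction of English are assumptions:

from parakeet.frontend import English  # assumed import path

frontend = English()
phonemes = frontend.phoneticize("Hello, world.")
# with this fix the phoneme sequence is wrapped with the vocab's start/end symbols
# (when they are defined), so numericalize() sees explicit sentence boundaries
ids = frontend.numericalize(phonemes)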

@ -22,11 +22,10 @@ class Vocab(object):
self.stoi = OrderedDict()
self.stoi.update(self.special_symbols)
N = len(self.special_symbols)
for i, s in enumerate(symbols):
if s not in self.stoi:
self.stoi[s] = N +i
self.stoi[s] = len(self.stoi)
self.itos = {v: k for k, v in self.stoi.items()}
def __len__(self):
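A small worked example of the indexing bug fixed above, assuming a single special symbol so that N = 1:

# symbols = ["a", "a", "b"]:
#   old: stoi["a"] = N + 0 = 1; the duplicate "a" at i = 1 is skipped; stoi["b"] = N + 2 = 3
#        -> id 2 is never assigned, leaving a hole in the id range
#   new: stoi["a"] = 1, stoi["b"] = len(self.stoi) = 2 -> ids stay contiguous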

@ -21,7 +21,7 @@ class ConvBlock(nn.Layer):
std = math.sqrt(4 * keep_prob / (kernel_size * in_channel))
padding = "valid" if causal else "same"
conv = nn.Conv1d(in_channel, 2 * in_channel, (kernel_size, ),
conv = nn.Conv1D(in_channel, 2 * in_channel, (kernel_size, ),
padding=padding,
data_format="NLC",
weight_attr=I.Normal(scale=std))

@ -2,11 +2,12 @@ import math
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from parakeet.modules.attention import _split_heads, _concat_heads, drop_head, scaled_dot_product_attention
from parakeet.modules.transformer import PositionwiseFFN
from parakeet.modules import masking
from parakeet.modules.cbhg import Conv1dBatchNorm
from parakeet.modules.conv import Conv1dBatchNorm
from parakeet.modules import positional_encoding as pe
__all__ = ["TransformerTTS"]
@ -21,7 +22,7 @@ class MultiheadAttention(nn.Layer):
Another deviation is that it concats the input query and context vector before
applying the output projection.
"""
def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None):
def __init__(self, model_dim, num_heads, k_dim=None, v_dim=None, k_input_dim=None, v_input_dim=None):
"""
Args:
model_dim (int): the feature size of query.
@ -42,9 +43,11 @@ class MultiheadAttention(nn.Layer):
depth = model_dim // num_heads
k_dim = k_dim or depth
v_dim = v_dim or depth
k_input_dim = k_input_dim or model_dim
v_input_dim = v_input_dim or model_dim
self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
self.affine_k = nn.Linear(k_input_dim, num_heads * k_dim)
self.affine_v = nn.Linear(v_input_dim, num_heads * v_dim)
self.affine_o = nn.Linear(model_dim + num_heads * v_dim, model_dim)
self.num_heads = num_heads
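A constructor-level sketch of item 5 in the commit message; the import path is an assumption, and the forward pass is unchanged by this commit so it is omitted here:

from parakeet.models.transformer_tts import MultiheadAttention  # assumed module path

# decoder queries are 128-dim while the encoder memory used for keys/values is 64-dim
cross_attn = MultiheadAttention(model_dim=128, num_heads=4,
                                k_input_dim=64, v_input_dim=64)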
@ -128,7 +131,7 @@ class TransformerDecoderLayer(nn.Layer):
"""
Transformer decoder layer.
"""
def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
def __init__(self, d_model, n_heads, d_ffn, dropout=0., d_encoder=None):
"""
Args:
d_model (int): the feature size of the input, and the output.
@ -141,7 +144,7 @@ class TransformerDecoderLayer(nn.Layer):
self.self_mha = MultiheadAttention(d_model, n_heads)
self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
self.cross_mha = MultiheadAttention(d_model, n_heads)
self.cross_mha = MultiheadAttention(d_model, n_heads, k_input_dim=d_encoder, v_input_dim=d_encoder)
self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
@ -194,10 +197,10 @@ class TransformerEncoder(nn.LayerList):
class TransformerDecoder(nn.LayerList):
def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0.):
def __init__(self, d_model, n_heads, d_ffn, n_layers, dropout=0., d_encoder=None):
super(TransformerDecoder, self).__init__()
for _ in range(n_layers):
self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout))
self.append(TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout, d_encoder=d_encoder))
def forward(self, q, k, v, encoder_mask, decoder_mask):
self_attention_weights = []
@ -233,7 +236,7 @@ class CNNPostNet(nn.Layer):
c_out = d_output if i == n_layers - 1 else d_hidden
self.convs.append(
Conv1dBatchNorm(c_in, c_out, kernel_size, padding=padding))
self.last_norm = nn.BatchNorm1d(d_output)
self.last_norm = nn.BatchNorm1D(d_output)
def forward(self, x):
x_in = x
@ -244,44 +247,51 @@ class CNNPostNet(nn.Layer):
class TransformerTTS(nn.Layer):
def __init__(self, vocab_size, padding_idx, d_model, d_mel, n_heads, d_ffn, positional_encoding_scalar,
def __init__(self, vocab_size, padding_idx, d_encoder, d_decoder, d_mel, n_heads, d_ffn,
encoder_layers, decoder_layers, d_prenet, d_postnet, postnet_layers,
postnet_kernel_size, max_reduction_factor, dropout):
super(TransformerTTS, self).__init__()
self.encoder_prenet = nn.Embedding(vocab_size, d_model, padding_idx)
self.encoder_pe = pe.positional_encoding(0, 1000, d_model) # it may be extended later
self.encoder = TransformerEncoder(d_model, n_heads, d_ffn, encoder_layers, dropout)
# initial pe scalar is 1, though it is trainable
self.pe_scalar = self.create_parameter([1], attr=I.Constant(1.))
self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_model, dropout)
self.decoder_pe = pe.positional_encoding(0, 1000, d_model) # it may be extended later
self.decoder = TransformerDecoder(d_model, n_heads, d_ffn, decoder_layers, dropout)
self.final_proj = nn.Linear(d_model, max_reduction_factor * d_mel)
# encoder
self.encoder_prenet = nn.Embedding(vocab_size, d_encoder, padding_idx)
self.encoder_pe = pe.positional_encoding(0, 1000, d_encoder) # it may be extended later
self.encoder = TransformerEncoder(d_encoder, n_heads, d_ffn, encoder_layers, dropout)
# decoder
self.decoder_prenet = MLPPreNet(d_mel, d_prenet, d_decoder, dropout)
self.decoder_pe = pe.positional_encoding(0, 1000, d_decoder) # it may be extended later
self.decoder = TransformerDecoder(d_decoder, n_heads, d_ffn, decoder_layers, dropout, d_encoder=d_encoder)
self.final_proj = nn.Linear(d_decoder, max_reduction_factor * d_mel)
self.decoder_postnet = CNNPostNet(d_mel, d_postnet, d_mel, postnet_kernel_size, postnet_layers)
self.stop_conditioner = nn.Linear(d_mel, 3)
# specs
self.padding_idx = padding_idx
self.d_model = d_model
self.pe_scalar = positional_encoding_scalar
self.d_encoder = d_encoder
self.d_decoder = d_decoder
# start and end
# start and end: though it is only used in predict
# it can also be used in training
dtype = paddle.get_default_dtype()
self.start_vec = paddle.fill_constant([1, d_mel], dtype=dtype, value=0)
self.end_vec = paddle.fill_constant([1, d_mel], dtype=dtype, value=0)
self.start_vec = paddle.full([1, d_mel], 0, dtype=dtype)
self.end_vec = paddle.full([1, d_mel], 0, dtype=dtype)
self.stop_prob_index = 2
self.max_r = max_reduction_factor
self.r = max_reduction_factor # set it every call
def forward(self, text, mel, stop):
pass
encoded, encoder_attention_weights, encoder_mask = self.encode(text)
mel_output, mel_intermediate, cross_attention_weights, stop_logits = self.decode(encoded, mel, encoder_mask)
return mel_output, mel_intermediate, encoder_attention_weights, cross_attention_weights
def encode(self, text):
T_enc = text.shape[-1]
embed = self.encoder_prenet(text)
pe = self.encoder_pe[:T_enc, :] # (T, C)
x = embed.scale(math.sqrt(self.d_model)) + pe.scale(self.pe_scalar)
x = embed.scale(math.sqrt(self.d_encoder)) + pe * self.pe_scalar
encoder_padding_mask = masking.id_mask(text, self.padding_idx, dtype=x.dtype)
x = F.dropout(x, training=self.training)
@ -341,8 +351,13 @@ class TransformerTTS(nn.Layer):
break
return decoder_output[:, 1:, :], encoder_attentions, cross_attention_weights
class TransformerTTSLoss(nn.Layer):
def __init__(self, stop_loss_scale):
super(TransformerTTSLoss, self).__init__()
self.stop_loss_scale = stop_loss_scale
def forward(self, ):
return loss, details
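A keyword-argument view of the new TransformerTTS signature, mirroring the updated unit test at the bottom of this commit; the import path is an assumption:

from parakeet.models.transformer_tts import TransformerTTS  # assumed module path

# encoder and decoder widths may now differ (item 4 of the commit message)
model = TransformerTTS(
    vocab_size=128, padding_idx=0,
    d_encoder=64, d_decoder=128, d_mel=80,
    n_heads=4, d_ffn=128,
    encoder_layers=6, decoder_layers=6,
    d_prenet=128, d_postnet=128, postnet_layers=4,
    postnet_kernel_size=3, max_reduction_factor=10, dropout=0.5)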

@ -40,7 +40,7 @@ class CBHG(nn.Layer):
proj_out_channels = projection_channels + \
[in_channels] # ensure residual connection
for c_in, c_out in zip(proj_in_channels, proj_out_channels):
conv = nn.Conv1d(c_in, c_out, (3,), padding=(1, 1))
conv = nn.Conv1D(c_in, c_out, (3,), padding=(1, 1))
self.projections.append(conv)
if in_channels != highway_features:

@ -1,7 +1,7 @@
import paddle
from paddle import nn
class Conv1dCell(nn.Conv1d):
class Conv1dCell(nn.Conv1D):
"""
A subclass of Conv1d layer, which can be used like an RNN cell. It can take
step input and return step output. It is done by keeping an internal buffer,
@ -86,12 +86,12 @@ class Conv1dBatchNorm(nn.Layer):
weight_attr=None, bias_attr=None):
super(Conv1dBatchNorm, self).__init__()
# TODO(chenfeiyu): carefully initialize Conv1d's weight
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride,
self.conv = nn.Conv1D(in_channels, out_channels, kernel_size, stride,
padding=padding,
weight_attr=weight_attr,
bias_attr=bias_attr)
# TODO: channel last, but BatchNorm1d does not support channel last layout
self.bn = nn.BatchNorm1d(out_channels)
self.bn = nn.BatchNorm1D(out_channels)
def forward(self, x):
return self.bn(self.conv(x))

@ -0,0 +1,31 @@
import paddle
from paddle import nn
from paddle.nn import functional as F
def weighted_mean(input, weight):
"""weighted mean.(It can also be used as masked mean.)
Args:
input (Tensor): input tensor, floating point dtype.
weight (Tensor): weight tensor with broadcastable shape.
Returns:
Tensor: shape(1,), weighted mean tensor with the same dtype as input.
"""
weight = paddle.cast(weight, input.dtype)
broadcast_factor = input.numel() / weight.numel()
return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_factor)
def masked_l1_loss(prediction, target, mask):
abs_error = F.l1_loss(prediction, target, reduction='none')
return weighted_mean(abs_error, mask)
def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1):
ce = F.softmax_with_cross_entropy(logits, label, axis=axis)
return weighted_mean(ce, mask)
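A worked check of weighted_mean's masked-mean behavior, mirroring the first unit test below:

import paddle
from parakeet.modules.losses import weighted_mean

x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])  # row i holds value i in 3 columns
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)               # keep rows 5..9
# sum(x * mask) = (5 + 6 + 7 + 8 + 9) * 3 = 105, sum(mask) = 5, broadcast_factor = 30 / 10 = 3,
# so the result is 105 / (5 * 3) = 7, i.e. the mean over the unmasked rows
print(weighted_mean(x, mask))  # -> 7.0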

tests/test_losses.py

@ -0,0 +1,33 @@
import unittest
import paddle
paddle.set_device("cpu")
import numpy as np
from parakeet.modules.losses import weighted_mean, masked_l1_loss, masked_softmax_with_cross_entropy
class TestWeightedMean(unittest.TestCase):
def test(self):
x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
loss = weighted_mean(x, mask)
self.assertAlmostEqual(loss.numpy()[0], 7)
class TestMaskedL1Loss(unittest.TestCase):
def test(self):
x = paddle.arange(0, 10, dtype="float64").unsqueeze(-1).broadcast_to([10, 3])
y = paddle.zeros_like(x)
mask = (paddle.arange(0, 10, dtype="float64") > 4).unsqueeze(-1)
loss = masked_l1_loss(x, y, mask)
print(loss)
self.assertAlmostEqual(loss.numpy()[0], 7)
class TestMaskedCrossEntropy(unittest.TestCase):
def test(self):
x = paddle.randn([3, 30, 8], dtype="float64")
y = paddle.randint(0, 8, [3, 30], dtype="int64").unsqueeze(-1) # mind this
mask = paddle.fluid.layers.sequence_mask(
paddle.to_tensor([30, 18, 27]), dtype="int64").unsqueeze(-1)
loss = masked_softmax_with_cross_entropy(x, y, mask)
print(loss)

@ -63,8 +63,7 @@ class TestTransformerDecoderLayer(unittest.TestCase):
class TestTransformerTTS(unittest.TestCase):
def setUp(self):
net = tts.TransformerTTS(
128, 0, 64, 80, 4, 128,
0.5,
128, 0, 64, 128, 80, 4, 128,
6, 6, 128, 128, 4,
3, 10, 0.5)
self.net = net