Merge branch 'add_TranTTS' into 'master'

Add tran tts

See merge request !1
This commit is contained in:
lifuchen 2020-01-08 12:00:18 +08:00
commit ca5d57c1c7
42 changed files with 4093 additions and 6 deletions

View File

@ -0,0 +1 @@
from .audio import AudioProcessor

261
parakeet/audio/audio.py Normal file
View File

@ -0,0 +1,261 @@
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
class AudioProcessor(object):
def __init__(self,
sample_rate=None, # int, sampling rate
num_mels=None, # int, bands of mel spectrogram
min_level_db=None, # float, minimum level db
ref_level_db=None, # float, reference level db
n_fft=None, # int: number of samples in a frame for stft
win_length=None, # int: the same meaning with n_fft
hop_length=None, # int: number of samples between neighboring frame
power=None, # float:power to raise before griffin-lim
preemphasis=None, # float: preemphasis coefficident
signal_norm=None, #
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
max_norm=None, # float, max norm
mel_fmin=None, # int: mel spectrogram's minimum frequency
mel_fmax=None, # int: mel spectrogram's maximum frequency
clip_norm=True, # bool: clip spectrogram's norm
griffin_lim_iters=None, # int:
do_trim_silence=False, # bool: trim silence
sound_norm=False,
**kwargs):
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
self.ref_level_db = ref_level_db
# stft related
self.n_fft = n_fft
self.win_length = win_length or n_fft
# hop length defaults to 1/4 window_length
self.hop_length = hop_length or 0.25 * self.win_length
self.power = power
self.preemphasis = float(preemphasis)
self.griffin_lim_iters = griffin_lim_iters
self.signal_norm = signal_norm
self.symmetric_norm = symmetric_norm
# mel transform related
self.mel_fmin = mel_fmin
self.mel_fmax = mel_fmax
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
def _stft_parameters(self):
"""compute frame length and hop length in ms"""
frame_length_ms = self.win_length * 1. / self.sample_rate
frame_shift_ms = self.hop_length * 1. / self.sample_rate
num_freq = 1 + self.n_fft // 2
return num_freq, frame_length_ms, frame_shift_ms
def __repr__(self):
"""object repr"""
cls_name_str = self.__class__.__name__
members = vars(self)
dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str
def save_wav(self, path, wav):
"""save audio with scipy.io.wavfile in 16bit integers"""
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.as_type(np.int16))
def load_wav(self, path, sr=None):
"""load wav -> trim_silence -> rescale"""
x, sr = librosa.load(path, sr=None)
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
print(" [!] File cannot be trimmed for silence - {}".format(path))
if self.sound_norm:
x = x / x.max() * 0.9 # why 0.9 ?
return x
def trim_silence(self, wav):
"""Trim soilent parts with a threshold and 0.01s margin"""
margin = int(self.sample_rate * 0.01)
wav = wav[margin: -margin]
trimed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
return trimed_wav
def apply_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
def _amplitude_to_db(self, x):
amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(amplitude_min, x))
@staticmethod
def _db_to_amplitude(x):
return np.power(10., 0.05 * x)
def _linear_to_mel(self, spectrogram):
_mel_basis = self._build_mel_basis()
return np.dot(_mel_basis, spectrogram)
def _mel_to_linear(self, mel_spectrogram):
inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram))
def _build_mel_basis(self):
"""return mel basis for mel scale"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
if self.signal_norm:
S_norm = (S - self.min_level_db) / (-self.min_level_db)
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
return S_norm
else:
S_norm = self.max_norm * S_norm
if self.clip_norm:
S_norm = np.clip(S_norm, 0, self.max_norm)
return S_norm
else:
return S
def _denormalize(self, S):
"""denormalize values"""
S_denorm = S
if self.signal_norm:
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
return S_denorm
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
return S_denorm
else:
return S
def _stft(self, y):
return librosa.stft(
y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
def _istft(self, S):
return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
def spectrogram(self, y):
"""compute linear spectrogram(amplitude)
preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize
"""
if self.preemphasis:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db
return self._normalize(S)
def melspectrogram(self, y):
"""compute linear spectrogram(amplitude)
preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize
"""
if self.preemphasis:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
"""convert spectrogram back to waveform using griffin_lim in librosa"""
S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
def inv_melspectrogram(self, mel_spectrogram):
S = self._denormalize(mel_spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
S = self._linear_to_mel(np.abs(S))
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
def out_linear_to_mel(self, linear_spec):
"""convert output linear spec to mel spec"""
S = self._denormalize(linear_spec)
S = self._db_to_amplitude(S + self.ref_level_db)
S = self._linear_to_mel(np.abs(S))
S = self._amplitude_to_db(S) - self.ref_level_db
mel = self._normalize(S)
return mel
def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = self._istft(S_complex * angles)
for _ in range(self.griffin_lim_iters):
angles = np.exp(1j * np.angle(self._stft(y)))
y = self._istft(S_complex * angles)
return y
@staticmethod
def mulaw_encode(wav, qc):
mu = 2 ** qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
# Quantize signal to the specified number of levels.
signal = (signal + 1) / 2 * mu + 0.5
return np.floor(signal,)
@staticmethod
def mulaw_decode(wav, qc):
"""Recovers waveform from quantized values."""
mu = 2 ** qc - 1
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
return x
@staticmethod
def encode_16bits(x):
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
@staticmethod
def quantize(x, bits):
return (x + 1.) * (2**bits - 1) / 2
@staticmethod
def dequantize(x, bits):
return 2 * x / (2**bits - 1) - 1

View File

@ -2,7 +2,8 @@ from .sampler import SequentialSampler, RandomSampler, BatchSampler
class DataCargo(object):
def __init__(self, dataset, batch_size=1, sampler=None,
shuffle=False, batch_sampler=None, drop_last=False):
shuffle=False, batch_sampler=None, collate_fn=None,
drop_last=False):
self.dataset = dataset
if batch_sampler is not None:
@ -21,13 +22,20 @@ class DataCargo(object):
sampler = RandomSampler(dataset)
else:
sampler = SequentialSampler(dataset)
# auto_collation without custom batch_sampler
batch_sampler = BatchSampler(sampler, batch_size, drop_last)
else:
batch_sampler = BatchSampler(sampler, batch_size, drop_last)
self.batch_sampler = batch_sampler
if collate_fn is None:
collate_fn = dataset._batch_examples
self.collate_fn = collate_fn
self.batch_size = batch_size
self.drop_last = drop_last
self.sampler = sampler
self.batch_sampler = batch_sampler
def __iter__(self):
return DataIterator(self)
@ -57,6 +65,7 @@ class DataIterator(object):
self._index_sampler = loader._index_sampler
self._sampler_iter = iter(self._index_sampler)
self.collate_fn = loader.collate_fn
def __iter__(self):
return self
@ -64,7 +73,7 @@ class DataIterator(object):
def __next__(self):
index = self._next_index() # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size
minibatch = [self._dataset[i] for i in index] # we can abstract it, too to use dynamic batch size
minibatch = self._dataset._batch_examples(minibatch) # list[Example] -> Batch
minibatch = self.collate_fn(minibatch)
return minibatch
def _next_index(self):

View File

View File

@ -0,0 +1,148 @@
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from paddle import fluid
from parakeet import g2p
from parakeet import audio
from parakeet.data.sampler import *
from parakeet.data.datacargo import DataCargo
from parakeet.data.dataset import Dataset
from parakeet.data.batch import TextIDBatcher, SpecBatcher
class LJSpeechLoader:
def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(config.data_path)
dataset = LJSpeech(LJSPEECH_ROOT, config)
sampler = DistributedSampler(len(dataset), nranks, rank, shuffle=shuffle)
assert config.batch_size % nranks == 0
each_bs = config.batch_size // nranks
if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, collate_fn=batch_examples_vocoder, drop_last=True)
else:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, collate_fn=batch_examples, drop_last=True)
self.reader = fluid.io.DataLoader.from_generator(
capacity=32,
iterable=True,
use_double_buffer=True,
return_list=True)
self.reader.set_batch_generator(dataloader, place)
class LJSpeech(Dataset):
def __init__(self, root, config):
super(LJSpeech, self).__init__()
assert isinstance(root, (str, Path)), "root should be a string or Path object"
self.root = root if isinstance(root, Path) else Path(root)
self.metadata = self._prepare_metadata()
self.config = config
self._ljspeech_processor = audio.AudioProcessor(
sample_rate=config.audio.sr,
num_mels=config.audio.num_mels,
min_level_db=config.audio.min_level_db,
ref_level_db=config.audio.ref_level_db,
n_fft=config.audio.n_fft,
win_length= config.audio.win_length,
hop_length= config.audio.hop_length,
power=config.audio.power,
preemphasis=config.audio.preemphasis,
signal_norm=True,
symmetric_norm=False,
max_norm=1.,
mel_fmin=0,
mel_fmax=None,
clip_norm=True,
griffin_lim_iters=60,
do_trim_silence=False,
sound_norm=False)
def _prepare_metadata(self):
csv_path = self.root.joinpath("metadata.csv")
metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3,
names=["fname", "raw_text", "normalized_text"])
return metadata
def _get_example(self, metadatum):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
This method may require several processor, each of which has a lot of options.
In this case, you'd better pass a composed transform and pass it to the init
method.
"""
fname, raw_text, normalized_text = metadatum
wav_path = self.root.joinpath("wavs", fname + ".wav")
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav = self._ljspeech_processor.load_wav(str(wav_path))
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
def __getitem__(self, index):
metadatum = self.metadata.iloc[index]
example = self._get_example(metadatum)
return example
def __iter__(self):
for i in range(len(self)):
yield self[i]
def __len__(self):
return len(self.metadata)
def batch_examples(batch):
texts = []
mels = []
mel_inputs = []
text_lens = []
pos_texts = []
pos_mels = []
for data in batch:
_, mel, text = data
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1))
pos_mels.append(np.arange(1, mel.shape[1] + 1))
mels.append(mel)
texts.append(text)
# Sort by text_len in descending order
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
text_lens = sorted(text_lens, reverse=True)
# Pad sequence with largest len of the batch
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_vocoder(batch):
mels=[]
mags=[]
for data in batch:
mag, mel, _ = data
mels.append(mel)
mags.append(mag)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
return (mels, mags)

View File

View File

@ -0,0 +1,41 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
encoder_output_size: 384
word_vec_dim: 384
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
decoder_output_size: 384
d_model: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
image_step: 2000
use_gpu: False
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
transtts_path: ./checkpoint
transformer_step: 70000
log_dir: ./log

View File

@ -0,0 +1,43 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
encoder_n_layer: 6
encoder_head: 2
encoder_conv1d_filter_size: 1536
max_sep_len: 2048
encoder_output_size: 384
embedding_size: 384
decoder_n_layer: 6
decoder_head: 2
decoder_conv1d_filter_size: 1536
decoder_output_size: 384
hidden_size: 384
duration_predictor_output_size: 256
duration_predictor_filter_size: 3
fft_conv1d_filter: 3
fft_conv1d_padding: 1
dropout: 0.1
transformer_head: 4
warm_up_step: 4000
grad_clip_thresh: 0.1
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
transtts_path: ../transformerTTS/checkpoint
transformer_step: 20
log_dir: ./log

View File

@ -0,0 +1,150 @@
import numpy as np
import math
import utils
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
super(FFTBlock, self).__init__()
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, dropout=dropout)
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
"""
Feed Forward Transformer block in FastSpeech.
Args:
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
T means the timesteps of input.
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
len_q means the sequence length of query, len_k means the sequence length of key.
Returns:
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
"""
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask
output = self.pos_ffn(output)
output *= non_pad_mask
return output, slf_attn
class LengthRegulator(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(LengthRegulator, self).__init__()
self.duration_predictor = DurationPredictor(input_size=input_size,
out_channels=out_channels,
filter_size=filter_size,
dropout=dropout)
def LR(self, x, duration_predictor_output, alpha=1.0):
output = []
batch_size = x.shape[0]
for i in range(batch_size):
output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha))
output = self.pad(output)
return output
def pad(self, input_ele):
max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
out_list = []
for i in range(len(input_ele)):
pad_len = max_len - input_ele[i].shape[0]
one_batch_padded = layers.pad(
input_ele[i], [0, pad_len, 0, 0], pad_value=0.0)
out_list.append(one_batch_padded)
out_padded = layers.stack(out_list)
return out_padded
def expand(self, batch, predicted, alpha):
out = []
time_steps = batch.shape[1]
fertilities = predicted.numpy()
batch = layers.squeeze(batch,[0])
for i in range(time_steps):
if fertilities[0,i]==0:
continue
out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1]))
out = layers.concat(out, axis=0)
return out
def forward(self, x, alpha=1.0, target=None):
"""
Length Regulator block in FastSpeech.
Args:
x (Variable): Shape(B, T, C), dtype: float32. The encoder output.
alpha (Constant): dtype: float32. The hyperparameter to determine the length of
the expanded sequence mel, thereby controlling the voice speed.
target (Variable): (Variable, optional): Shape(B, T_text),
dtype: int64. The duration of phoneme compute from pretrained transformerTTS.
Returns:
output (Variable), Shape(B, T, C), the output after exppand.
duration_predictor_output (Variable), Shape(B, T, C), the output of duration predictor.
"""
duration_predictor_output = self.duration_predictor(x)
if fluid.framework._dygraph_tracer()._train_mode:
output = self.LR(x, target)
return output, duration_predictor_output
else:
duration_predictor_output = layers.round(duration_predictor_output)
output = self.LR(x, duration_predictor_output, alpha)
mel_pos = dg.to_variable([i+1 for i in range(output.shape[1])])
return output, mel_pos
class DurationPredictor(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(DurationPredictor, self).__init__()
self.input_size = input_size
self.out_channels = out_channels
self.filter_size = filter_size
self.dropout = dropout
self.conv1 = Conv1D(in_channels = self.input_size,
out_channels = self.out_channels,
filter_size = self.filter_size,
padding=1,
data_format='NTC')
self.conv2 = Conv1D(in_channels = self.out_channels,
out_channels = self.out_channels,
filter_size = self.filter_size,
padding=1,
data_format='NTC')
self.layer_norm1 = dg.LayerNorm(self.out_channels)
self.layer_norm2 = dg.LayerNorm(self.out_channels)
self.linear =dg.Linear(self.out_channels, 1)
def forward(self, encoder_output):
"""
Duration Predictor block in FastSpeech.
Args:
encoder_output (Variable): Shape(B, T, C), dtype: float32. The encoder output.
Returns:
out (Variable), Shape(B, T, C), the output of duration predictor.
"""
# encoder_output.shape(N, T, C)
out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout)
out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout)
out = layers.relu(self.linear(out))
out = layers.squeeze(out, axes=[-1])
return out

View File

@ -0,0 +1,214 @@
from utils import *
from modules import *
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols
from parakeet.modules.utils import *
from parakeet.modules.post_convnet import PostConvNet
class Encoder(dg.Layer):
def __init__(self,
n_src_vocab,
len_max_seq,
d_word_vec,
n_layers,
n_head,
d_k,
d_v,
d_model,
d_inner,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=0.1):
super(Encoder, self).__init__()
n_position = len_max_seq + 1
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, character, text_pos):
"""
Encoder layer of FastSpeech.
Args:
character (Variable): Shape(B, T_text), dtype: float32. The input text
characters. T_text means the timesteps of input characters.
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
position. T_text means the timesteps of input characters.
Returns:
enc_output (Variable), Shape(B, text_T, C), the encoder output.
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
"""
enc_slf_attn_list = []
# -- prepare masks
# shape character (N, T)
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
non_pad_mask = get_non_pad_mask(character)
# -- Forward
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(
enc_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask, enc_slf_attn_list
class Decoder(dg.Layer):
def __init__(self,
len_max_seq,
d_word_vec,
n_layers,
n_head,
d_k,
d_v,
d_model,
d_inner,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=0.1):
super(Decoder, self).__init__()
n_position = len_max_seq + 1
self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, enc_seq, enc_pos):
"""
Decoder layer of FastSpeech.
Args:
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
The output of length regulator.
enc_pos (Variable, optional): Shape(B, T_mel),
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
Returns:
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
dec_slf_attn_list = []
# -- Prepare masks
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
non_pad_mask = get_non_pad_mask(enc_pos)
# -- Forward
dec_output = enc_seq + self.position_enc(enc_pos)
for dec_layer in self.layer_stack:
dec_output, dec_slf_attn = dec_layer(
dec_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
dec_slf_attn_list += [dec_slf_attn]
return dec_output, dec_slf_attn_list
class FastSpeech(dg.Layer):
def __init__(self, cfg):
" FastSpeech"
super(FastSpeech, self).__init__()
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg.max_sep_len,
d_word_vec=cfg.embedding_size,
n_layers=cfg.encoder_n_layer,
n_head=cfg.encoder_head,
d_k=64,
d_v=64,
d_model=cfg.hidden_size,
d_inner=cfg.encoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding,
dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg.hidden_size,
out_channels=cfg.duration_predictor_output_size,
filter_size=cfg.duration_predictor_filter_size,
dropout=cfg.dropout)
self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
d_word_vec=cfg.embedding_size,
n_layers=cfg.decoder_n_layer,
n_head=cfg.decoder_head,
d_k=64,
d_v=64,
d_model=cfg.hidden_size,
d_inner=cfg.decoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding,
dropout=0.1)
self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
self.postnet = PostConvNet(n_mels=80,
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=1,
use_cudnn=True,
dropout=0.1)
def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
"""
FastSpeech model.
Args:
character (Variable): Shape(B, T_text), dtype: float32. The input text
characters. T_text means the timesteps of input characters.
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
position. T_text means the timesteps of input characters.
mel_pos (Variable, optional): Shape(B, T_mel),
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
length_target (Variable, optional): Shape(B, T_text),
dtype: int64. The duration of phoneme compute from pretrained transformerTTS.
alpha (Constant):
dtype: float32. The hyperparameter to determine the length of the expanded sequence
mel, thereby controlling the voice speed.
Returns:
mel_output (Variable), Shape(B, mel_T, C), the mel output before postnet.
mel_output_postnet (Variable), Shape(B, mel_T, C), the mel output after postnet.
duration_predictor_output (Variable), Shape(B, text_T), the duration of phoneme compute
with duration predictor.
enc_slf_attn_list (Variable), Shape(B, text_T, text_T), the encoder self attention list.
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
target=length_target,
alpha=alpha)
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
else:
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
decoder_output = self.decoder(length_regulator_output, decoder_pos)
mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet

View File

@ -0,0 +1,93 @@
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of audio data file.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--embedding_size', type=int, default=256,
help="the dim size of embedding.")
parser.add_argument('--encoder_n_layer', type=int, default=6,
help="the number of FFT Block in encoder.")
parser.add_argument('--encoder_head', type=int, default=2,
help="the attention head number in encoder.")
parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in encoder.")
parser.add_argument('--max_sep_len', type=int, default=2048,
help="the max length of sequence.")
parser.add_argument('--encoder_output_size', type=int, default=256,
help="the output channel size of encoder.")
parser.add_argument('--decoder_n_layer', type=int, default=6,
help="the number of FFT Block in decoder.")
parser.add_argument('--decoder_head', type=int, default=2,
help="the attention head number in decoder.")
parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in decoder.")
parser.add_argument('--decoder_output_size', type=int, default=256,
help="the output channel size of decoder.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model.")
parser.add_argument('--duration_predictor_output_size', type=int, default=256,
help="the output size of duration predictior.")
parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
help="the filter size of conv1d in duration prediction.")
parser.add_argument('--fft_conv1d_filter', type=int, default=3,
help="the filter size of conv1d in fft.")
parser.add_argument('--fft_conv1d_padding', type=int, default=1,
help="the padding size of conv1d in fft.")
parser.add_argument('--dropout', type=float, default=0.1,
help="the dropout in network.")
parser.add_argument('--transformer_head', type=int, default=4,
help="the attention head num of transformerTTS.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
help="the threshold of grad clip.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log',
help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=70000,
help="the step to load transformerTTS model.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -0,0 +1,139 @@
import numpy as np
import argparse
import os
import time
import math
import jsonargparse
from pathlib import Path
from tqdm import tqdm
from tensorboardX import SummaryWriter
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parse import add_config_options_to_parser
from pprint import pprint
from network import FastSpeech
from utils import get_alignment
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
from parakeet.models.transformerTTS.network import TransformerTTS
class MyDataParallel(dg.parallel.DataParallel):
"""
A data parallel proxy for model.
"""
def __init__(self, layers, strategy):
super(MyDataParallel, self).__init__(layers, strategy)
def __getattr__(self, key):
if key in self.__dict__:
return object.__getattribute__(self, key)
elif key is "_layers":
return object.__getattribute__(self, "_sub_layers")["_layers"]
else:
return getattr(
object.__getattribute__(self, "_sub_layers")["_layers"], key)
def main(cfg):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'fastspeech')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
transformerTTS = TransformerTTS(cfg)
model_path = os.path.join(cfg.transtts_path, "transformer")
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
#for param in transformerTTS.state_dict():
# print(param)
transformerTTS.set_dict(model_dict)
transformerTTS.eval()
model = FastSpeech(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank).reader()
if cfg.checkpoint_path is not None:
model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
print("load checkpoint!!!")
if cfg.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = MyDataParallel(model, strategy)
for epoch in range(cfg.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
character, mel, mel_input, pos_text, pos_mel, text_length = data
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)
global_step += 1
#Forward
result= model(character,
pos_text,
mel_pos=pos_mel,
length_target=alignment)
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
mel_loss = layers.mse_loss(mel_output, mel)
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
total_loss = mel_loss + mel_postnet_loss + duration_loss
if local_rank==0:
print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if cfg.use_data_parallel:
total_loss = model.scale_loss(total_loss)
total_loss.backward()
model.apply_collective_grads()
else:
total_loss.backward()
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
writer.close()
if __name__ =='__main__':
parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
main(cfg)

View File

@ -0,0 +1,32 @@
import numpy as np
def get_alignment(attn_probs, n_head):
max_F = 0
assert attn_probs[0].shape[0] % n_head == 0
batch_size = int(attn_probs[0].shape[0] // n_head)
for i in range(len(attn_probs)):
multi_attn = attn_probs[i].numpy()
for j in range(n_head):
attn = multi_attn[j*batch_size:(j+1)*batch_size]
F = score_F(attn)
if max_F < F:
max_F = F
max_attn = attn
alignment = compute_duration(max_attn)
return alignment
def score_F(attn):
max = np.max(attn, axis=-1)
mean = np.mean(max)
return mean
def compute_duration(attn):
alignment = np.zeros([attn.shape[0],attn.shape[2]])
for i in range(attn.shape[0]):
for j in range(attn.shape[1]):
max_index = attn[i,j].tolist().index(attn[i,j].max())
alignment[i,max_index] += 1
return alignment

View File

@ -0,0 +1,20 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
max_len: 50
transformer_step: 1
postnet_step: 1
use_gpu: True
checkpoint_path: ./checkpoint
log_dir: ./log
sample_path: ./sample

View File

@ -0,0 +1,27 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: True
data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log

View File

@ -0,0 +1,33 @@
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 1000
image_step: 2000
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint/transformer/1

View File

@ -0,0 +1,166 @@
import math
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Conv1D(dg.Layer):
"""
A convolution 1D block implemented with Conv2D. Form simplicity and
ensuring the output has the same length as the input, it does not allow
stride > 1.
"""
def __init__(self,
in_channels,
num_filters,
filter_size=3,
padding=0,
dilation=1,
stride=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format='NCT',
dtype="float32"):
super(Conv1D, self).__init__(dtype=dtype)
self.padding = padding
self.in_channels = in_channels
self.num_filters = num_filters
self.filter_size = filter_size
self.stride = stride
self.dilation = dilation
self.padding = padding
self.act = act
self.data_format = data_format
self.conv = dg.Conv2D(
in_channels=in_channels,
num_filters=num_filters,
filter_size=(1, filter_size),
stride=(1, stride),
dilation=(1, dilation),
padding=(0, padding),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.conv(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
class Pool1D(dg.Layer):
"""
A Pool 1D block implemented with Pool2D.
"""
def __init__(self,
pool_size=-1,
pool_type='max',
pool_stride=1,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
exclusive=True,
data_format='NCT',
dtype='float32'):
super(Pool1D, self).__init__(dtype=dtype)
self.pool_size = pool_size
self.pool_type = pool_type
self.pool_stride = pool_stride
self.pool_padding = pool_padding
self.global_pooling = global_pooling
self.use_cudnn = use_cudnn
self.ceil_mode = ceil_mode
self.exclusive = exclusive
self.data_format = data_format
self.dtype = dtype
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
global_pooling = global_pooling, use_cudnn = use_cudnn,
ceil_mode = ceil_mode, exclusive = exclusive, dtype = dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.pool2d(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
class DynamicGRU(dg.Layer):
def __init__(self,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = dg.GRUUnit(
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res

View File

@ -0,0 +1,242 @@
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv, Pool1D
from parakeet.modules.dynamicGRU import DynamicGRU
import numpy as np
class EncoderPrenet(dg.Layer):
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
super(EncoderPrenet, self).__init__()
self.embedding_size = embedding_size
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
param_attr = fluid.ParamAttr(name='weight'),
padding_idx = None)
self.conv_list = []
self.conv_list.append(Conv(in_channels = embedding_size,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT"))
for _ in range(2):
self.conv_list.append(Conv(in_channels = num_hidden,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(3)]
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
self.projection = dg.Linear(num_hidden, num_hidden)
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embending_size)
x = layers.transpose(x,[0,2,1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C)
x = self.projection(x)
return x
class CBHG(dg.Layer):
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
max_pool_kernel_size=2, is_post=False):
super(CBHG, self).__init__()
"""
:param hidden_size: dimension of hidden unit
:param K: # of convolution banks
:param projection_size: dimension of projection unit
:param num_gru_layers: # of layers of GRUcell
:param max_pool_kernel_size: max pooling kernel size
:param is_post: whether post processing or not
"""
self.hidden_size = hidden_size
self.projection_size = projection_size
self.conv_list = []
self.conv_list.append(Conv(in_channels = projection_size,
out_channels = hidden_size,
filter_size = 1,
padding = int(np.floor(1/2)),
data_format = "NCT"))
for i in range(2,K+1):
self.conv_list.append(Conv(in_channels = hidden_size,
out_channels = hidden_size,
filter_size = i,
padding = int(np.floor(i/2)),
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batchnorm_list = []
for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list):
self.add_sublayer("batchnorm_list_{}".format(i), layer)
conv_outdim = hidden_size * K
self.conv_projection_1 = Conv(in_channels = conv_outdim,
out_channels = hidden_size,
filter_size = 3,
padding = int(np.floor(3/2)),
data_format = "NCT")
self.conv_projection_2 = Conv(in_channels = hidden_size,
out_channels = projection_size,
filter_size = 3,
padding = int(np.floor(3/2)),
data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format = "NCT")
self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0)
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True,
origin_mode=True,
h_0 = h_0)
def _conv_fit_dim(self, x, filter_size=3):
if filter_size % 2 == 0:
return x[:,:,:-1]
else:
return x
def forward(self, input_):
# input_.shape = [N, C, T]
conv_list = []
conv_input = input_
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i+1)
conv_input = layers.relu(batchnorm(conv_input))
conv_list.append(conv_input)
conv_cat = layers.concat(conv_list, axis=1)
conv_pool = self.max_pool(conv_cat)[:,:,:-1]
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
# conv_proj.shape = [N, C, T]
highway = layers.transpose(conv_proj, [0,2,1])
highway = self.highway(highway)
# highway.shape = [N, T, C]
fc_forward = self.fc_forward1(highway)
fc_reverse = self.fc_reverse1(highway)
out_forward = self.gru_forward1(fc_forward)
out_reverse = self.gru_reverse1(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
fc_forward = self.fc_forward2(out)
fc_reverse = self.fc_reverse2(out)
out_forward = self.gru_forward2(fc_forward)
out_reverse = self.gru_reverse2(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
out = layers.transpose(out, [0,2,1])
return out
class Highwaynet(dg.Layer):
def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__()
self.num_units = num_units
self.num_layers = num_layers
self.gates = []
self.linears = []
for i in range(num_layers):
self.linears.append(dg.Linear(num_units, num_units))
self.gates.append(dg.Linear(num_units, num_units))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.add_sublayer("linears_{}".format(i), linear)
self.add_sublayer("gates_{}".format(i), gate)
def forward(self, input_):
out = input_
for linear, gate in zip(self.linears, self.gates):
h = fluid.layers.relu(linear(out))
t_ = fluid.layers.sigmoid(gate(out))
c = 1 - t_
out = h * t_ + out * c
return out

View File

@ -0,0 +1,206 @@
from parakeet.models.transformerTTS.module import *
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
from parakeet.modules.prenet import PreNet
from parakeet.modules.post_convnet import PostConvNet
class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha',
initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=config.use_gpu)
self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
def forward(self, x, positional):
if fluid.framework._dygraph_tracer()._train_mode:
query_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask(positional, x)
else:
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
# Positional dropout
x = layers.dropout(x, 0.1)
# Self attention encoder
attentions = list()
for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
x = ffn(x)
attentions.append(attention)
return x, query_mask, attentions
class Decoder(dg.Layer):
def __init__(self, num_hidden, config):
super(Decoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha')
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.linear = dg.Linear(num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = dg.Linear(num_hidden, 1)
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config.audio.outputs_per_step,
use_cudnn = config.use_gpu)
def forward(self, key, value, query, c_mask, positional):
# get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
else:
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network
query = self.decoder_prenet(query)
# Centered position
query = self.linear(query)
# Get position embedding
positional = self.pos_emb(positional)
query = positional * self.alpha + query
#positional dropout
query = fluid.layers.dropout(query, 0.1)
# Attention decoder-decoder, encoder-decoder
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
# Mel linear projection
mel_out = self.mel_linear(query)
# Post Mel Network
out = self.postconvnet(mel_out)
out = mel_out + out
# Stop tokens
stop_tokens = self.stop_linear(query)
stop_tokens = layers.squeeze(stop_tokens, [-1])
stop_tokens = layers.sigmoid(stop_tokens)
return mel_out, out, attn_list, stop_tokens, selfattn_list
class TransformerTTS(dg.Layer):
def __init__(self, config):
super(TransformerTTS, self).__init__()
self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
self.decoder = Decoder(config.hidden_size, config)
self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel):
# key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
key, c_mask, attns_enc = self.encoder(characters, pos_text)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
# attn_probs (128, mel_len, seq_len)
# stop_preds (batch_size, mel_len, 1)
# attns_dec (128, mel_len, mel_len)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
class ModelPostNet(dg.Layer):
"""
CBHG Network (mel -> linear)
"""
def __init__(self, config):
super(ModelPostNet, self).__init__()
self.pre_proj = Conv1D(in_channels = config.audio.num_mels,
out_channels = config.hidden_size,
filter_size=1,
data_format = "NCT")
self.cbhg = CBHG(config.hidden_size, config.batch_size)
self.post_proj = Conv1D(in_channels = config.hidden_size,
out_channels = (config.audio.n_fft // 2) + 1,
filter_size=1,
data_format = "NCT")
def forward(self, mel):
mel = layers.transpose(mel, [0,2,1])
mel = self.pre_proj(mel)
mel = self.cbhg(mel)
mag_pred = self.post_proj(mel)
mag_pred = layers.transpose(mag_pred, [0,2,1])
return mag_pred

View File

@ -0,0 +1,67 @@
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of audio data file.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in network.")
parser.add_argument('--embedding_size', type=int, default=512,
help="the embedding vector size.")
parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
help="the threshold of grad clip.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
help="Global step to restore checkpoint of transformer in synthesis.")
parser.add_argument('--postnet_step', type=int, default=100000,
help="Global step to restore checkpoint of postnet in synthesis.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./log',
help="the directory to save audio sample in synthesis.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)

View File

@ -0,0 +1,123 @@
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from parakeet import g2p
from parakeet import audio
from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler
from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
_ljspeech_processor = audio.AudioProcessor(
sample_rate=22050,
num_mels=80,
min_level_db=-100,
ref_level_db=20,
n_fft=2048,
win_length= int(22050 * 0.05),
hop_length= int(22050 * 0.0125),
power=1.2,
preemphasis=0.97,
signal_norm=True,
symmetric_norm=False,
max_norm=1.,
mel_fmin=0,
mel_fmax=None,
clip_norm=True,
griffin_lim_iters=60,
do_trim_silence=False,
sound_norm=False)
class LJSpeech(Dataset):
def __init__(self, root):
super(LJSpeech, self).__init__()
assert isinstance(root, (str, Path)), "root should be a string or Path object"
self.root = root if isinstance(root, Path) else Path(root)
self.metadata = self._prepare_metadata()
def _prepare_metadata(self):
csv_path = self.root.joinpath("metadata.csv")
metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3,
names=["fname", "raw_text", "normalized_text"])
return metadata
def _get_example(self, metadatum):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
This method may require several processor, each of which has a lot of options.
In this case, you'd better pass a composed transform and pass it to the init
method.
"""
fname, raw_text, normalized_text = metadatum
wav_path = self.root.joinpath("wavs", fname + ".wav")
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav = _ljspeech_processor.load_wav(str(wav_path))
mag = _ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
def __getitem__(self, index):
metadatum = self.metadata.iloc[index]
example = self._get_example(metadatum)
return example
def __iter__(self):
for i in range(len(self)):
yield self[i]
def __len__(self):
return len(self.metadata)
def batch_examples(batch):
texts = []
mels = []
mel_inputs = []
text_lens = []
pos_texts = []
pos_mels = []
for data in batch:
_, mel, text = data
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1))
pos_mels.append(np.arange(1, mel.shape[1] + 1))
mels.append(mel)
texts.append(text)
# Sort by text_len in descending order
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
text_lens = sorted(text_lens, reverse=True)
# Pad sequence with largest len of the batch
texts = TextIDBatcher(pad_id=0)(texts)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_vocoder(batch):
mels=[]
mags=[]
for data in batch:
mag, mel, _ = data
mels.append(mel)
mags.append(mag)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
return (mels, mags)

View File

@ -0,0 +1,67 @@
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
import numpy as np
from network import Model, ModelPostNet
from tqdm import tqdm
from tensorboardX import SummaryWriter
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from preprocess import _ljspeech_processor
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
return model_dict
def synthesis(text_input, cfg):
place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
# tensorboard
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'synthesis')
writer = SummaryWriter(path)
with dg.guard(place):
model = Model(cfg)
model_postnet = ModelPostNet(cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
# init input
text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
model.eval()
model_postnet.eval()
pbar = tqdm(range(cfg.max_len))
for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1)
mag_pred = model_postnet(postnet_pred)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
writer.add_audio(text_input, wav, 0, cfg.audio.sr)
if not os.path.exists(cfg.sample_path):
os.mkdir(cfg.sample_path)
write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav)
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
synthesis("Transformer model is so fast!", cfg)

View File

@ -0,0 +1,111 @@
from network import *
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
class MyDataParallel(dg.parallel.DataParallel):
"""
A data parallel proxy for model.
"""
def __init__(self, layers, strategy):
super(MyDataParallel, self).__init__(layers, strategy)
def __getattr__(self, key):
if key in self.__dict__:
return object.__getattribute__(self, key)
elif key is "_layers":
return object.__getattribute__(self, "_sub_layers")["_layers"]
else:
return getattr(
object.__getattribute__(self, "_sub_layers")["_layers"], key)
def main(cfg):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'postnet')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
model = ModelPostNet(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
parameter_list=model.parameters())
if cfg.checkpoint_path is not None:
model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
print("load checkpoint!!!")
if cfg.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = MyDataParallel(model, strategy)
reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(cfg.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
mel, mag = data
mag = dg.to_variable(mag.numpy())
mel = dg.to_variable(mel.numpy())
global_step += 1
mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
if cfg.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
print("===============",model.pre_proj.conv.weight.numpy())
print("===============",model.pre_proj.conv.weight.gradient())
model.clear_gradients()
if local_rank==0:
writer.add_scalars('training_loss',{
'loss':loss.numpy(),
}, global_step)
if global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
writer.close()
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())
main(cfg)

View File

@ -0,0 +1,150 @@
import os
from tqdm import tqdm
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from network import *
from tensorboardX import SummaryWriter
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
from matplotlib import cm
from parakeet.modules.utils import cross_entropy
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
class MyDataParallel(dg.parallel.DataParallel):
"""
A data parallel proxy for model.
"""
def __init__(self, layers, strategy):
super(MyDataParallel, self).__init__(layers, strategy)
def __getattr__(self, key):
if key in self.__dict__:
return object.__getattribute__(self, key)
elif key is "_layers":
return object.__getattribute__(self, "_sub_layers")["_layers"]
else:
return getattr(
object.__getattribute__(self, "_sub_layers")["_layers"], key)
def main(cfg):
local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
model = TransformerTTS(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank).reader()
if cfg.checkpoint_path is not None:
model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
print("load checkpoint!!!")
if cfg.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = MyDataParallel(model, strategy)
for epoch in range(cfg.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
character, mel, mel_input, pos_text, pos_mel, text_length = data
global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
label = np.zeros(stop_preds.shape).astype(np.float32)
text_length = text_length.numpy()
for i in range(label.shape[0]):
label[i][text_length[i] - 1] = 1
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
loss = mel_loss + post_mel_loss + stop_loss
if local_rank==0:
writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(),
'post_mel_loss':post_mel_loss.numpy(),
'stop_loss':stop_loss.numpy()
}, global_step)
writer.add_scalars('alphas', {
'encoder_alpha':model.encoder.alpha.numpy(),
'decoder_alpha':model.decoder.alpha.numpy(),
}, global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if global_step % cfg.image_step == 1:
for i, prob in enumerate(attn_probs):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
for i, prob in enumerate(attn_enc):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
for i, prob in enumerate(attn_dec):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
if cfg.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
writer.close()
if __name__ =='__main__':
parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
main(cfg)

View File

@ -0,0 +1,111 @@
# WaveFlow with Paddle Fluid
Paddle fluid implementation of [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219).
## Project Structure
```text
├── configs # yaml configuration files of preset model hyperparameters
├── benchmark.py # benchmark code to test the speed of batched speech synthesis
├── data.py # dataset and dataloader settings for LJSpeech
├── synthesis.py # script for speech synthesis
├── train.py # script for model training
├── utils.py # helper functions for e.g., model checkpointing
├── waveflow.py # WaveFlow model high level APIs
└── waveflow_modules.py # WaveFlow model implementation
```
## Usage
There are many hyperparameters to be tuned depending on the specification of model and dataset you are working on.
We provide `wavenet_ljspeech.yaml` as a hyperparameter set that works well on the LJSpeech dataset.
Note that `train.py`, `synthesis.py`, and `benchmark.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for both training, synthesizing and benchmarking. You can also overwrite these preset hyperparameters with command line by updating parameters after `--config`.
For example `--config=${yaml} --batch_size=8` can overwrite the corresponding hyperparameters in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`.
Note that you also need to specify some additional parameters for `train.py`, `synthesis.py`, and `benchmark.py`, and the details can be found in `train.add_options_to_parser`, `synthesis.add_options_to_parser`, and `benchmark.add_options_to_parser`, respectively.
### Dataset
Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
In this example, assume that the path of unzipped LJSpeech dataset is `./data/LJSpeech-1.1`.
### Train on single GPU
```bash
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
export CUDA_VISIBLE_DEVICES=0
python -u train.py \
--config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \
--name=${ModelName} --batch_size=4 \
--parallel=false --use_gpu=true
```
#### Save and Load checkpoints
Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default.
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint):
1. Use `--checkpoint=./runs/waveflow/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed.
2. Use `--iteration=500000`.
3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/waveflow/${ModelName}/checkpoint`.
### Train on multiple GPUs
```bash
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -u -m paddle.distributed.launch train.py \
--config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \
--name=${ModelName} --parallel=true --use_gpu=true
```
Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to set the GPUs that you want to use to be visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode.
### Monitor with Tensorboard
By default, the logs are saved in `./runs/waveflow/${ModelName}/logs/`. You can monitor logs by tensorboard.
```bash
tensorboard --logdir=${log_dir} --port=8888
```
### Synthesize from a checkpoint
Check the [Save and load checkpoint](#save-and-load-checkpoints) section on how to load a specific checkpoint.
The following example will automatically load the latest checkpoint:
```bash
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
export CUDA_VISIBLE_DEVICES=0
python -u synthesis.py \
--config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \
--name=${ModelName} --use_gpu=true \
--output=./syn_audios \
--sample=${SAMPLE} \
--sigma=1.0
```
In this example, `--output` specifies where to save the synthesized audios and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset.
### Benchmarking
Use the following example to benchmark the speed of batched speech synthesis, which reports how many times faster than real-time:
```bash
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
export CUDA_VISIBLE_DEVICES=0
python -u benchmark.py \
--config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \
--name=${ModelName} --use_gpu=true
```

View File

@ -0,0 +1,71 @@
import os
import random
from pprint import pprint
import jsonargparse
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
import utils
from waveflow import WaveFlow
def add_options_to_parser(parser):
parser.add_argument('--model', type=str, default='waveflow',
help="general name of the model")
parser.add_argument('--name', type=str,
help="specific name of the training model")
parser.add_argument('--root', type=str,
help="root path of the LJSpeech dataset")
parser.add_argument('--use_gpu', type=bool, default=True,
help="option to use gpu training")
parser.add_argument('--iteration', type=int, default=None,
help=("which iteration of checkpoint to load, "
"default to load the latest checkpoint"))
parser.add_argument('--checkpoint', type=str, default=None,
help="path of the checkpoint to load")
def benchmark(config):
pprint(jsonargparse.namespace_to_dict(config))
# Get checkpoint directory path.
run_dir = os.path.join("runs", config.model, config.name)
checkpoint_dir = os.path.join(run_dir, "checkpoint")
# Configurate device.
place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace()
with dg.guard(place):
# Fix random seed.
seed = config.seed
random.seed(seed)
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
print("Random Seed: ", seed)
# Build model.
model = WaveFlow(config, checkpoint_dir)
model.build(training=False)
# Run model inference.
model.benchmark()
if __name__ == "__main__":
# Create parser.
parser = jsonargparse.ArgumentParser(
description="Synthesize audio using WaveNet model",
formatter_class='default_argparse')
add_options_to_parser(parser)
utils.add_config_options_to_parser(parser)
# Parse argument from both command line and yaml config file.
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
config = parser.parse_args()
benchmark(config)

View File

@ -0,0 +1,24 @@
valid_size: 16
segment_length: 16000
sample_rate: 22050
fft_window_shift: 256
fft_window_size: 1024
fft_size: 1024
mel_bands: 80
mel_fmin: 0.0
mel_fmax: 8000.0
seed: 1234
learning_rate: 0.0002
batch_size: 8
test_every: 2000
save_every: 10000
max_iterations: 3000000
sigma: 1.0
n_flows: 8
n_group: 16
n_layers: 8
n_channels: 64
kernel_h: 3
kernel_w: 3

View File

@ -0,0 +1,131 @@
import random
import librosa
import numpy as np
from paddle import fluid
from parakeet.datasets import ljspeech
from parakeet.data import dataset
from parakeet.data.batch import SpecBatcher, WavBatcher
from parakeet.data.datacargo import DataCargo
from parakeet.data.sampler import DistributedSampler, BatchSampler
from scipy.io.wavfile import read
class Dataset(ljspeech.LJSpeech):
def __init__(self, config):
super(Dataset, self).__init__(config.root)
self.config = config
def _get_example(self, metadatum):
fname, _, _ = metadatum
wav_path = self.root.joinpath("wavs", fname + ".wav")
loaded_sr, audio = read(wav_path)
assert loaded_sr == self.config.sample_rate
return audio
class Subset(dataset.Dataset):
def __init__(self, dataset, indices, valid):
self.dataset = dataset
self.indices = indices
self.valid = valid
self.config = dataset.config
def get_mel(self, audio):
spectrogram = librosa.core.stft(
audio, n_fft=self.config.fft_size,
hop_length=self.config.fft_window_shift,
win_length=self.config.fft_window_size)
spectrogram_magnitude = np.abs(spectrogram)
# mel_filter_bank shape: [n_mels, 1 + n_fft/2]
mel_filter_bank = librosa.filters.mel(
sr=self.config.sample_rate,
n_fft=self.config.fft_size,
n_mels=self.config.mel_bands,
fmin=self.config.mel_fmin,
fmax=self.config.mel_fmax)
# mel shape: [n_mels, num_frames]
mel = np.dot(mel_filter_bank, spectrogram_magnitude)
# Normalize mel.
clip_val = 1e-5
ref_constant = 1
mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant)
return mel
def __getitem__(self, idx):
audio = self.dataset[self.indices[idx]]
segment_length = self.config.segment_length
if self.valid:
# whole audio for valid set
pass
else:
# audio shape: [len]
if audio.shape[0] >= segment_length:
max_audio_start = audio.shape[0] - segment_length
audio_start = random.randint(0, max_audio_start)
audio = audio[audio_start : (audio_start + segment_length)]
else:
audio = np.pad(audio, (0, segment_length - audio.shape[0]),
mode='constant', constant_values=0)
# Normalize audio to the [-1, 1] range.
audio = audio.astype(np.float32) / 32768.0
mel = self.get_mel(audio)
return audio, mel
def _batch_examples(self, batch):
audios = [sample[0] for sample in batch]
mels = [sample[1] for sample in batch]
audios = WavBatcher(pad_value=0.0)(audios)
mels = SpecBatcher(pad_value=0.0)(mels)
return audios, mels
def __len__(self):
return len(self.indices)
class LJSpeech:
def __init__(self, config, nranks, rank):
place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
# Whole LJSpeech dataset.
ds = Dataset(config)
# Split into train and valid dataset.
indices = list(range(len(ds)))
train_indices = indices[config.valid_size:]
valid_indices = indices[:config.valid_size]
random.shuffle(train_indices)
# Train dataset.
trainset = Subset(ds, train_indices, valid=False)
sampler = DistributedSampler(len(trainset), nranks, rank)
total_bs = config.batch_size
assert total_bs % nranks == 0
train_sampler = BatchSampler(sampler, total_bs // nranks,
drop_last=True)
trainloader = DataCargo(trainset, batch_sampler=train_sampler)
trainreader = fluid.io.PyReader(capacity=50, return_list=True)
trainreader.decorate_batch_generator(trainloader, place)
self.trainloader = (data for _ in iter(int, 1)
for data in trainreader())
# Valid dataset.
validset = Subset(ds, valid_indices, valid=True)
# Currently only support batch_size = 1 for valid loader.
validloader = DataCargo(validset, batch_size=1, shuffle=False)
validreader = fluid.io.PyReader(capacity=20, return_list=True)
validreader.decorate_batch_generator(validloader, place)
self.validloader = validreader

View File

@ -0,0 +1,85 @@
import os
import random
from pprint import pprint
import jsonargparse
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
import utils
from waveflow import WaveFlow
def add_options_to_parser(parser):
parser.add_argument('--model', type=str, default='waveflow',
help="general name of the model")
parser.add_argument('--name', type=str,
help="specific name of the training model")
parser.add_argument('--root', type=str,
help="root path of the LJSpeech dataset")
parser.add_argument('--use_gpu', type=bool, default=True,
help="option to use gpu training")
parser.add_argument('--iteration', type=int, default=None,
help=("which iteration of checkpoint to load, "
"default to load the latest checkpoint"))
parser.add_argument('--checkpoint', type=str, default=None,
help="path of the checkpoint to load")
parser.add_argument('--output', type=str, default="./syn_audios",
help="path to write synthesized audio files")
parser.add_argument('--sample', type=int, default=None,
help="which of the valid samples to synthesize audio")
def synthesize(config):
pprint(jsonargparse.namespace_to_dict(config))
# Get checkpoint directory path.
run_dir = os.path.join("runs", config.model, config.name)
checkpoint_dir = os.path.join(run_dir, "checkpoint")
# Configurate device.
place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace()
with dg.guard(place):
# Fix random seed.
seed = config.seed
random.seed(seed)
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
print("Random Seed: ", seed)
# Build model.
model = WaveFlow(config, checkpoint_dir)
model.build(training=False)
# Obtain the current iteration.
if config.checkpoint is None:
if config.iteration is None:
iteration = utils.load_latest_checkpoint(checkpoint_dir)
else:
iteration = config.iteration
else:
iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
# Run model inference.
model.infer(iteration)
if __name__ == "__main__":
# Create parser.
parser = jsonargparse.ArgumentParser(
description="Synthesize audio using WaveNet model",
formatter_class='default_argparse')
add_options_to_parser(parser)
utils.add_config_options_to_parser(parser)
# Parse argument from both command line and yaml config file.
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
config = parser.parse_args()
synthesize(config)

View File

@ -0,0 +1,114 @@
import os
import random
import subprocess
import time
from pprint import pprint
import jsonargparse
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
from tensorboardX import SummaryWriter
import slurm
import utils
from waveflow import WaveFlow
def add_options_to_parser(parser):
parser.add_argument('--model', type=str, default='waveflow',
help="general name of the model")
parser.add_argument('--name', type=str,
help="specific name of the training model")
parser.add_argument('--root', type=str,
help="root path of the LJSpeech dataset")
parser.add_argument('--parallel', type=bool, default=True,
help="option to use data parallel training")
parser.add_argument('--use_gpu', type=bool, default=True,
help="option to use gpu training")
parser.add_argument('--iteration', type=int, default=None,
help=("which iteration of checkpoint to load, "
"default to load the latest checkpoint"))
parser.add_argument('--checkpoint', type=str, default=None,
help="path of the checkpoint to load")
def train(config):
use_gpu = config.use_gpu
parallel = config.parallel if use_gpu else False
# Get the rank of the current training process.
rank = dg.parallel.Env().local_rank if parallel else 0
nranks = dg.parallel.Env().nranks if parallel else 1
if rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(config))
# Make checkpoint directory.
run_dir = os.path.join("runs", config.model, config.name)
checkpoint_dir = os.path.join(run_dir, "checkpoint")
os.makedirs(checkpoint_dir, exist_ok=True)
# Create tensorboard logger.
tb = SummaryWriter(os.path.join(run_dir, "logs")) \
if rank == 0 else None
# Configurate device
place = fluid.CUDAPlace(rank) if use_gpu else fluid.CPUPlace()
with dg.guard(place):
# Fix random seed.
seed = config.seed
random.seed(seed)
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
print("Random Seed: ", seed)
# Build model.
model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
model.build()
# Obtain the current iteration.
if config.checkpoint is None:
if config.iteration is None:
iteration = utils.load_latest_checkpoint(checkpoint_dir, rank)
else:
iteration = config.iteration
else:
iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
while iteration < config.max_iterations:
# Run one single training step.
model.train_step(iteration)
iteration += 1
if iteration % config.test_every == 0:
# Run validation step.
model.valid_step(iteration)
if rank == 0 and iteration % config.save_every == 0:
# Save parameters.
model.save(iteration)
# Close TensorBoard.
if rank == 0:
tb.close()
if __name__ == "__main__":
# Create parser.
parser = jsonargparse.ArgumentParser(description="Train WaveFlow model",
formatter_class='default_argparse')
add_options_to_parser(parser)
utils.add_config_options_to_parser(parser)
# Parse argument from both command line and yaml config file.
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
config = parser.parse_args()
train(config)

View File

@ -0,0 +1,114 @@
import itertools
import os
import time
import jsonargparse
import numpy as np
import paddle.fluid.dygraph as dg
def add_config_options_to_parser(parser):
parser.add_argument('--valid_size', type=int,
help="size of the valid dataset")
parser.add_argument('--segment_length', type=int,
help="the length of audio clip for training")
parser.add_argument('--sample_rate', type=int,
help="sampling rate of audio data file")
parser.add_argument('--fft_window_shift', type=int,
help="the shift of fft window for each frame")
parser.add_argument('--fft_window_size', type=int,
help="the size of fft window for each frame")
parser.add_argument('--fft_size', type=int,
help="the size of fft filter on each frame")
parser.add_argument('--mel_bands', type=int,
help="the number of mel bands when calculating mel spectrograms")
parser.add_argument('--mel_fmin', type=float,
help="lowest frequency in calculating mel spectrograms")
parser.add_argument('--mel_fmax', type=float,
help="highest frequency in calculating mel spectrograms")
parser.add_argument('--seed', type=int,
help="seed of random initialization for the model")
parser.add_argument('--learning_rate', type=float)
parser.add_argument('--batch_size', type=int,
help="batch size for training")
parser.add_argument('--test_every', type=int,
help="test interval during training")
parser.add_argument('--save_every', type=int,
help="checkpointing interval during training")
parser.add_argument('--max_iterations', type=int,
help="maximum training iterations")
parser.add_argument('--sigma', type=float,
help="standard deviation of the latent Gaussian variable")
parser.add_argument('--n_flows', type=int,
help="number of flows")
parser.add_argument('--n_group', type=int,
help="number of adjacent audio samples to squeeze into one column")
parser.add_argument('--n_layers', type=int,
help="number of conv2d layer in one wavenet-like flow architecture")
parser.add_argument('--n_channels', type=int,
help="number of residual channels in flow")
parser.add_argument('--kernel_h', type=int,
help="height of the kernel in the conv2d layer")
parser.add_argument('--kernel_w', type=int,
help="width of the kernel in the conv2d layer")
parser.add_argument('--config', action=jsonargparse.ActionConfigFile)
def load_latest_checkpoint(checkpoint_dir, rank=0):
checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
# Create checkpoint index file if not exist.
if (not os.path.isfile(checkpoint_path)) and rank == 0:
with open(checkpoint_path, "w") as handle:
handle.write("model_checkpoint_path: step-0")
# Make sure that other process waits until checkpoint file is created
# by process 0.
while not os.path.isfile(checkpoint_path):
time.sleep(1)
# Fetch the latest checkpoint index.
with open(checkpoint_path, "r") as handle:
latest_checkpoint = handle.readline().split()[-1]
iteration = int(latest_checkpoint.split("-")[-1])
return iteration
def save_latest_checkpoint(checkpoint_dir, iteration):
checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
# Update the latest checkpoint index.
with open(checkpoint_path, "w") as handle:
handle.write("model_checkpoint_path: step-{}".format(iteration))
def load_parameters(checkpoint_dir, rank, model, optimizer=None,
iteration=None, file_path=None):
if file_path is None:
if iteration is None:
iteration = load_latest_checkpoint(checkpoint_dir, rank)
if iteration == 0:
return
file_path = "{}/step-{}".format(checkpoint_dir, iteration)
model_dict, optimizer_dict = dg.load_dygraph(file_path)
model.set_dict(model_dict)
print("[checkpoint] Rank {}: loaded model from {}".format(rank, file_path))
if optimizer and optimizer_dict:
optimizer.set_dict(optimizer_dict)
print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
rank, file_path))
def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
file_path = "{}/step-{}".format(checkpoint_dir, iteration)
model_dict = model.state_dict()
dg.save_dygraph(model_dict, file_path)
print("[checkpoint] Saved model to {}".format(file_path))
if optimizer:
opt_dict = optimizer.state_dict()
dg.save_dygraph(opt_dict, file_path)
print("[checkpoint] Saved optimzier state to {}".format(file_path))

View File

@ -0,0 +1,190 @@
import itertools
import os
import time
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
from scipy.io.wavfile import write
import utils
from data import LJSpeech
from waveflow_modules import WaveFlowLoss, WaveFlowModule
class WaveFlow():
def __init__(self, config, checkpoint_dir, parallel=False, rank=0,
nranks=1, tb_logger=None):
self.config = config
self.checkpoint_dir = checkpoint_dir
self.parallel = parallel
self.rank = rank
self.nranks = nranks
self.tb_logger = tb_logger
def build(self, training=True):
config = self.config
dataset = LJSpeech(config, self.nranks, self.rank)
self.trainloader = dataset.trainloader
self.validloader = dataset.validloader
waveflow = WaveFlowModule("waveflow", config)
# Dry run once to create and initalize all necessary parameters.
audio = dg.to_variable(np.random.randn(1, 16000).astype(np.float32))
mel = dg.to_variable(
np.random.randn(1, config.mel_bands, 63).astype(np.float32))
waveflow(audio, mel)
if training:
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=config.learning_rate)
# Load parameters.
utils.load_parameters(self.checkpoint_dir, self.rank,
waveflow, optimizer,
iteration=config.iteration,
file_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank))
# Data parallelism.
if self.parallel:
strategy = dg.parallel.prepare_context()
waveflow = dg.parallel.DataParallel(waveflow, strategy)
self.waveflow = waveflow
self.optimizer = optimizer
self.criterion = WaveFlowLoss(config.sigma)
else:
# Load parameters.
utils.load_parameters(self.checkpoint_dir, self.rank, waveflow,
iteration=config.iteration,
file_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank))
self.waveflow = waveflow
def train_step(self, iteration):
self.waveflow.train()
start_time = time.time()
audios, mels = next(self.trainloader)
load_time = time.time()
outputs = self.waveflow(audios, mels)
loss = self.criterion(outputs)
if self.parallel:
# loss = loss / num_trainers
loss = self.waveflow.scale_loss(loss)
loss.backward()
self.waveflow.apply_collective_grads()
else:
loss.backward()
self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters())
self.waveflow.clear_gradients()
graph_time = time.time()
if self.rank == 0:
loss_val = float(loss.numpy()) * self.nranks
log = "Rank: {} Step: {:^8d} Loss: {:<8.3f} " \
"Time: {:.3f}/{:.3f}".format(
self.rank, iteration, loss_val,
load_time - start_time, graph_time - load_time)
print(log)
tb = self.tb_logger
tb.add_scalar("Train-Loss-Rank-0", loss_val, iteration)
@dg.no_grad
def valid_step(self, iteration):
self.waveflow.eval()
tb = self.tb_logger
total_loss = []
sample_audios = []
start_time = time.time()
for i, batch in enumerate(self.validloader()):
audios, mels = batch
valid_outputs = self.waveflow(audios, mels)
valid_z, valid_log_s_list = valid_outputs
# Visualize latent z and scale log_s.
if self.rank == 0 and i == 0:
tb.add_histogram("Valid-Latent_z", valid_z.numpy(), iteration)
for j, valid_log_s in enumerate(valid_log_s_list):
hist_name = "Valid-{}th-Flow-Log_s".format(j)
tb.add_histogram(hist_name, valid_log_s.numpy(), iteration)
valid_loss = self.criterion(valid_outputs)
total_loss.append(float(valid_loss.numpy()))
total_time = time.time() - start_time
if self.rank == 0:
loss_val = np.mean(total_loss)
log = "Test | Rank: {} AvgLoss: {:<8.3f} Time {:<8.3f}".format(
self.rank, loss_val, total_time)
print(log)
tb.add_scalar("Valid-Avg-Loss", loss_val, iteration)
@dg.no_grad
def infer(self, iteration):
self.waveflow.eval()
config = self.config
sample = config.sample
output = "{}/{}/iter-{}".format(config.output, config.name, iteration)
os.makedirs(output, exist_ok=True)
mels_list = [mels for _, mels in self.validloader()]
if sample is not None:
mels_list = [mels_list[sample]]
for sample, mel in enumerate(mels_list):
filename = "{}/valid_{}.wav".format(output, sample)
print("Synthesize sample {}, save as {}".format(sample, filename))
start_time = time.time()
audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
syn_time = time.time() - start_time
audio = audio[0]
audio_time = audio.shape[0] / self.config.sample_rate
print("audio time {:.4f}, synthesis time {:.4f}".format(
audio_time, syn_time))
# Denormalize audio from [-1, 1] to [-32768, 32768] int16 range.
audio = audio.numpy() * 32768.0
audio = audio.astype('int16')
write(filename, config.sample_rate, audio)
@dg.no_grad
def benchmark(self):
self.waveflow.eval()
mels_list = [mels for _, mels in self.validloader()]
mel = fluid.layers.concat(mels_list, axis=2)
mel = mel[:, :, :864]
batch_size = 8
mel = fluid.layers.expand(mel, [batch_size, 1, 1])
for i in range(10):
start_time = time.time()
audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
print("audio.shape = ", audio.shape)
syn_time = time.time() - start_time
audio_time = audio.shape[1] * batch_size / self.config.sample_rate
print("audio time {:.4f}, synthesis time {:.4f}".format(
audio_time, syn_time))
print("{} X real-time".format(audio_time / syn_time))
def save(self, iteration):
utils.save_latest_parameters(self.checkpoint_dir, iteration,
self.waveflow, self.optimizer)
utils.save_latest_checkpoint(self.checkpoint_dir, iteration)

View File

@ -0,0 +1,351 @@
import itertools
import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
from parakeet.modules import conv, modules, weight_norm
def set_param_attr(layer, c_in=1):
if isinstance(layer, (weight_norm.Conv2DTranspose, weight_norm.Conv2D)):
k = np.sqrt(1.0 / (c_in * np.prod(layer._filter_size)))
weight_init = fluid.initializer.UniformInitializer(low=-k, high=k)
bias_init = fluid.initializer.UniformInitializer(low=-k, high=k)
elif isinstance(layer, dg.Conv2D):
weight_init = fluid.initializer.ConstantInitializer(0.0)
bias_init = fluid.initializer.ConstantInitializer(0.0)
else:
raise TypeError("Unsupported layer type.")
layer._param_attr = fluid.ParamAttr(initializer=weight_init)
layer._bias_attr = fluid.ParamAttr(initializer=bias_init)
def unfold(x, n_group):
length = x.shape[-1]
new_shape = x.shape[:-1] + [length // n_group, n_group]
return fluid.layers.reshape(x, new_shape)
class WaveFlowLoss:
def __init__(self, sigma=1.0):
self.sigma = sigma
def __call__(self, model_output):
z, log_s_list = model_output
for i, log_s in enumerate(log_s_list):
if i == 0:
log_s_total = fluid.layers.reduce_sum(log_s)
else:
log_s_total = log_s_total + fluid.layers.reduce_sum(log_s)
loss = fluid.layers.reduce_sum(z * z) / (2 * self.sigma * self.sigma) \
- log_s_total
loss = loss / np.prod(z.shape)
const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)
return loss + const
class Conditioner(dg.Layer):
def __init__(self, name_scope):
super(Conditioner, self).__init__(name_scope)
upsample_factors = [16, 16]
self.upsample_conv2d = []
for s in upsample_factors:
in_channel = 1
conv_trans2d = modules.Conv2DTranspose(
self.full_name(),
num_filters=1,
filter_size=(3, 2 * s),
padding=(1, s // 2),
stride=(1, s))
set_param_attr(conv_trans2d, c_in=in_channel)
self.upsample_conv2d.append(conv_trans2d)
for i, layer in enumerate(self.upsample_conv2d):
self.add_sublayer("conv2d_transpose_{}".format(i), layer)
def forward(self, x):
x = fluid.layers.unsqueeze(x, 1)
for layer in self.upsample_conv2d:
x = fluid.layers.leaky_relu(layer(x), alpha=0.4)
return fluid.layers.squeeze(x, [1])
def infer(self, x):
x = fluid.layers.unsqueeze(x, 1)
for layer in self.upsample_conv2d:
x = layer(x)
# Trim conv artifacts.
time_cutoff = layer._filter_size[1] - layer._stride[1]
x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4)
return fluid.layers.squeeze(x, [1])
class Flow(dg.Layer):
def __init__(self, name_scope, config):
super(Flow, self).__init__(name_scope)
self.n_layers = config.n_layers
self.n_channels = config.n_channels
self.kernel_h = config.kernel_h
self.kernel_w = config.kernel_w
# Transform audio: [batch, 1, n_group, time/n_group]
# => [batch, n_channels, n_group, time/n_group]
self.start = weight_norm.Conv2D(
self.full_name(),
num_filters=self.n_channels,
filter_size=(1, 1))
set_param_attr(self.start, c_in=1)
# Initializing last layer to 0 makes the affine coupling layers
# do nothing at first. This helps with training stability
# output shape: [batch, 2, n_group, time/n_group]
self.end = dg.Conv2D(
self.full_name(),
num_filters=2,
filter_size=(1, 1))
set_param_attr(self.end)
# receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze
dilation_dict = {8: [1, 1, 1, 1, 1, 1, 1, 1],
16: [1, 1, 1, 1, 1, 1, 1, 1],
32: [1, 2, 4, 1, 2, 4, 1, 2],
64: [1, 2, 4, 8, 16, 1, 2, 4],
128: [1, 2, 4, 8, 16, 32, 64, 1]}
self.dilation_h_list = dilation_dict[config.n_group]
self.in_layers = []
self.cond_layers = []
self.res_skip_layers = []
for i in range(self.n_layers):
dilation_h = self.dilation_h_list[i]
dilation_w = 2 ** i
in_layer = weight_norm.Conv2D(
self.full_name(),
num_filters=2 * self.n_channels,
filter_size=(self.kernel_h, self.kernel_w),
dilation=(dilation_h, dilation_w))
set_param_attr(in_layer, c_in=self.n_channels)
self.in_layers.append(in_layer)
cond_layer = weight_norm.Conv2D(
self.full_name(),
num_filters=2 * self.n_channels,
filter_size=(1, 1))
set_param_attr(cond_layer, c_in=config.mel_bands)
self.cond_layers.append(cond_layer)
if i < self.n_layers - 1:
res_skip_channels = 2 * self.n_channels
else:
res_skip_channels = self.n_channels
res_skip_layer = weight_norm.Conv2D(
self.full_name(),
num_filters=res_skip_channels,
filter_size=(1, 1))
set_param_attr(res_skip_layer, c_in=self.n_channels)
self.res_skip_layers.append(res_skip_layer)
self.add_sublayer("in_layer_{}".format(i), in_layer)
self.add_sublayer("cond_layer_{}".format(i), cond_layer)
self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer)
def forward(self, audio, mel):
# audio: [bs, 1, n_group, time/group]
# mel: [bs, mel_bands, n_group, time/n_group]
audio = self.start(audio)
for i in range(self.n_layers):
dilation_h = self.dilation_h_list[i]
dilation_w = 2 ** i
# Pad height dim (n_group): causal convolution
# Pad width dim (time): dialated non-causal convolution
pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
pad_left = pad_right = int((self.kernel_w-1) * dilation_w / 2)
audio_pad = fluid.layers.pad2d(audio,
paddings=[pad_top, pad_bottom, pad_left, pad_right])
hidden = self.in_layers[i](audio_pad)
cond_hidden = self.cond_layers[i](mel)
in_acts = hidden + cond_hidden
out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
res_skip_acts = self.res_skip_layers[i](out_acts)
if i < self.n_layers - 1:
audio += res_skip_acts[:, :self.n_channels, :, :]
skip_acts = res_skip_acts[:, self.n_channels:, :, :]
else:
skip_acts = res_skip_acts
if i == 0:
output = skip_acts
else:
output += skip_acts
return self.end(output)
def infer(self, audio, mel, queues):
audio = self.start(audio)
for i in range(self.n_layers):
dilation_h = self.dilation_h_list[i]
dilation_w = 2 ** i
state_size = dilation_h * (self.kernel_h - 1)
queue = queues[i]
if len(queue) == 0:
for j in range(state_size):
queue.append(fluid.layers.zeros_like(audio))
state = queue[0:state_size]
state = fluid.layers.concat([*state, audio], axis=2)
queue.pop(0)
queue.append(audio)
# Pad height dim (n_group): causal convolution
# Pad width dim (time): dialated non-causal convolution
pad_top, pad_bottom = 0, 0
pad_left = int((self.kernel_w-1) * dilation_w / 2)
pad_right = int((self.kernel_w-1) * dilation_w / 2)
state = fluid.layers.pad2d(state,
paddings=[pad_top, pad_bottom, pad_left, pad_right])
hidden = self.in_layers[i](state)
cond_hidden = self.cond_layers[i](mel)
in_acts = hidden + cond_hidden
out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
res_skip_acts = self.res_skip_layers[i](out_acts)
if i < self.n_layers - 1:
audio += res_skip_acts[:, :self.n_channels, :, :]
skip_acts = res_skip_acts[:, self.n_channels:, :, :]
else:
skip_acts = res_skip_acts
if i == 0:
output = skip_acts
else:
output += skip_acts
return self.end(output)
class WaveFlowModule(dg.Layer):
def __init__(self, name_scope, config):
super(WaveFlowModule, self).__init__(name_scope)
self.n_flows = config.n_flows
self.n_group = config.n_group
self.n_layers = config.n_layers
assert self.n_group % 2 == 0
assert self.n_flows % 2 == 0
self.conditioner = Conditioner(self.full_name())
self.flows = []
for i in range(self.n_flows):
flow = Flow(self.full_name(), config)
self.flows.append(flow)
self.add_sublayer("flow_{}".format(i), flow)
self.perms = []
half = self.n_group // 2
for i in range(self.n_flows):
perm = list(range(self.n_group))
if i < self.n_flows // 2:
perm = perm[::-1]
else:
perm[:half] = reversed(perm[:half])
perm[half:] = reversed(perm[half:])
self.perms.append(perm)
def forward(self, audio, mel):
mel = self.conditioner(mel)
assert mel.shape[2] >= audio.shape[1]
# Prune out the tail of audio/mel so that time/n_group == 0.
pruned_len = audio.shape[1] // self.n_group * self.n_group
if audio.shape[1] > pruned_len:
audio = audio[:, :pruned_len]
if mel.shape[2] > pruned_len:
mel = mel[:, :, :pruned_len]
# From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
# From [bs, time] to [bs, n_group, time/n_group]
audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
# [bs, 1, n_group, time/n_group]
audio = fluid.layers.unsqueeze(audio, 1)
log_s_list = []
for i in range(self.n_flows):
inputs = audio[:, :, :-1, :]
conds = mel[:, :, 1:, :]
outputs = self.flows[i](inputs, conds)
log_s = outputs[:, :1, :, :]
b = outputs[:, 1:, :, :]
log_s_list.append(log_s)
audio_0 = audio[:, :, :1, :]
audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b
audio = fluid.layers.concat([audio_0, audio_out], axis=2)
# Permute over the height dim.
audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
audio = fluid.layers.stack(audio_slices, axis=2)
mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
mel = fluid.layers.stack(mel_slices, axis=2)
z = fluid.layers.squeeze(audio, [1])
return z, log_s_list
def synthesize(self, mel, sigma=1.0):
mel = self.conditioner.infer(mel)
# From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
audio = fluid.layers.gaussian_random(
shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma)
for i in reversed(range(self.n_flows)):
# Permute over the height dimension.
audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
audio = fluid.layers.stack(audio_slices, axis=2)
mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
mel = fluid.layers.stack(mel_slices, axis=2)
audio_list = []
audio_0 = audio[:, :, 0:1, :]
audio_list.append(audio_0)
audio_h = audio_0
queues = [[] for _ in range(self.n_layers)]
for h in range(1, self.n_group):
inputs = audio_h
conds = mel[:, :, h:(h+1), :]
outputs = self.flows[i].infer(inputs, conds, queues)
log_s = outputs[:, 0:1, :, :]
b = outputs[:, 1:, :, :]
audio_h = (audio[:, :, h:(h+1), :] - b) / \
fluid.layers.exp(log_s)
audio_list.append(audio_h)
audio = fluid.layers.concat(audio_list, axis=2)
# audio: [bs, n_group, time/n_group]
audio = fluid.layers.squeeze(audio, [1])
# audio: [bs, time]
audio = fluid.layers.reshape(
fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])
return audio

View File

@ -0,0 +1,52 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class DynamicGRU(dg.Layer):
def __init__(self,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = dg.GRUUnit(
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
"""
Dynamic GRU block.
Args:
input (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
output (Variable), Shape(B, T, C), the result compute by GRU.
"""
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = layers.concat(res, axis=1)
return res

View File

@ -0,0 +1,52 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
import math
from parakeet.modules.layers import Conv
class PositionwiseFeedForward(dg.Layer):
''' A two-feed-forward-layer module '''
def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
super(PositionwiseFeedForward, self).__init__()
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.dropout = dropout
self.w_1 = Conv(in_channels = d_in,
out_channels = num_hidden,
filter_size = filter_size,
padding=padding,
use_cudnn = use_cudnn,
data_format = "NTC")
self.w_2 = Conv(in_channels = num_hidden,
out_channels = d_in,
filter_size = filter_size,
padding=padding,
use_cudnn = use_cudnn,
data_format = "NTC")
self.layer_norm = dg.LayerNorm(d_in)
def forward(self, input):
"""
Feed Forward Network.
Args:
input (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
output (Variable), Shape(B, T, C), the result after FFN.
"""
#FFN Networt
x = self.w_2(layers.relu(self.w_1(input)))
# dropout
x = layers.dropout(x, self.dropout)
# residual connection
x = x + input
#layer normalization
output = self.layer_norm(x)
return output

158
parakeet/modules/layers.py Normal file
View File

@ -0,0 +1,158 @@
import math
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Conv(dg.Layer):
def __init__(self, in_channels, out_channels, filter_size=1,
padding=0, dilation=1, stride=1, use_cudnn=True,
data_format="NCT", is_bias=True):
super(Conv, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_size = filter_size
self.padding = padding
self.dilation = dilation
self.stride = stride
self.use_cudnn = use_cudnn
self.data_format = data_format
self.is_bias = is_bias
self.weight_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer())
self.bias_attr = None
if is_bias is not False:
k = math.sqrt(1 / in_channels)
self.bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))
self.conv = Conv1D( in_channels = in_channels,
out_channels = out_channels,
filter_size = filter_size,
padding = padding,
dilation = dilation,
stride = stride,
param_attr = self.weight_attr,
bias_attr = self.bias_attr,
use_cudnn = use_cudnn,
data_format = data_format)
def forward(self, x):
x = self.conv(x)
return x
class Conv1D(dg.Layer):
"""
A convolution 1D block implemented with Conv2D. Form simplicity and
ensuring the output has the same length as the input, it does not allow
stride > 1.
"""
def __init__(self,
in_channels,
out_channels,
filter_size=3,
padding=0,
dilation=1,
stride=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format='NCT',
dtype="float32"):
super(Conv1D, self).__init__(dtype=dtype)
self.padding = padding
self.in_channels = in_channels
self.num_filters = out_channels
self.filter_size = filter_size
self.stride = stride
self.dilation = dilation
self.padding = padding
self.act = act
self.data_format = data_format
self.conv = dg.Conv2D(
num_channels=in_channels,
num_filters=out_channels,
filter_size=(1, filter_size),
stride=(1, stride),
dilation=(1, dilation),
padding=(0, padding),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.conv(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
class Pool1D(dg.Layer):
"""
A Pool 1D block implemented with Pool2D.
"""
def __init__(self,
pool_size=-1,
pool_type='max',
pool_stride=1,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
exclusive=True,
data_format='NCT'):
super(Pool1D, self).__init__()
self.pool_size = pool_size
self.pool_type = pool_type
self.pool_stride = pool_stride
self.pool_padding = pool_padding
self.global_pooling = global_pooling
self.use_cudnn = use_cudnn
self.ceil_mode = ceil_mode
self.exclusive = exclusive
self.data_format = data_format
self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
global_pooling = global_pooling, use_cudnn = use_cudnn,
ceil_mode = ceil_mode, exclusive = exclusive)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.pool2d(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x

View File

@ -0,0 +1,112 @@
import math
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class ScaledDotProductAttention(dg.Layer):
def __init__(self, d_key):
super(ScaledDotProductAttention, self).__init__()
self.d_key = d_key
# please attention this mask is diff from pytorch
def forward(self, key, value, query, mask=None, query_mask=None, dropout=0.1):
"""
Scaled Dot Product Attention.
Args:
key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
query (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
dropout (Constant): dtype: float32. The probability of dropout.
Returns:
result (Variable), Shape(B, T, C), the result of mutihead attention.
attention (Variable), Shape(n_head * B, T, C), the attention of key.
"""
# Compute attention score
attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y
attention = attention / math.sqrt(self.d_key)
# Mask key to ignore padding
if mask is not None:
attention = attention * mask
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
attention = attention + mask
attention = layers.softmax(attention)
attention = layers.dropout(attention, dropout)
# Mask query to ignore padding
if query_mask is not None:
attention = attention * query_mask
result = layers.matmul(attention, value)
return result, attention
class MultiheadAttention(dg.Layer):
def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1):
super(MultiheadAttention, self).__init__()
self.num_hidden = num_hidden
self.num_head = num_head
self.d_k = d_k
self.d_q = d_q
self.dropout = dropout
self.key = dg.Linear(num_hidden, num_head * d_k)
self.value = dg.Linear(num_hidden, num_head * d_k)
self.query = dg.Linear(num_hidden, num_head * d_q)
self.scal_attn = ScaledDotProductAttention(d_k)
self.fc = dg.Linear(num_head * d_q, num_hidden)
self.layer_norm = dg.LayerNorm(num_hidden)
def forward(self, key, value, query_input, mask=None, query_mask=None):
"""
Multihead Attention.
Args:
key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
query_input (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
Returns:
result (Variable), Shape(B, T, C), the result of mutihead attention.
attention (Variable), Shape(n_head * B, T, C), the attention of key.
"""
batch_size = key.shape[0]
seq_len_key = key.shape[1]
seq_len_query = query_input.shape[1]
# repeat masks h times
if query_mask is not None:
query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
if mask is not None:
mask = layers.expand(mask, (self.num_head, 1, 1))
# Make multihead attention
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k])
query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q])
key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q])
result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
# concat all multihead result
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
result = layers.dropout(self.fc(result), self.dropout)
result = result + query_input
result = self.layer_norm(result)
return result, attention

View File

@ -0,0 +1,75 @@
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv
class PostConvNet(dg.Layer):
def __init__(self,
n_mels=80,
num_hidden=512,
filter_size=5,
padding=0,
num_conv=5,
outputs_per_step=1,
use_cudnn=True,
dropout=0.1):
super(PostConvNet, self).__init__()
self.dropout = dropout
self.conv_list = []
self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step,
out_channels = num_hidden,
filter_size = filter_size,
padding = padding,
use_cudnn = use_cudnn,
data_format = "NCT"))
for _ in range(1, num_conv-1):
self.conv_list.append(Conv(in_channels = num_hidden,
out_channels = num_hidden,
filter_size = filter_size,
padding = padding,
use_cudnn = use_cudnn,
data_format = "NCT") )
self.conv_list.append(Conv(in_channels = num_hidden,
out_channels = n_mels * outputs_per_step,
filter_size = filter_size,
padding = padding,
use_cudnn = use_cudnn,
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(num_conv-1)]
self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW'))
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
def forward(self, input):
"""
Post Conv Net.
Args:
input (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
output (Variable), Shape(B, T, C), the result after postconvnet.
"""
input = layers.transpose(input, [0,2,1])
len = input.shape[-1]
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
output = layers.transpose(input, [0,2,1])
return output

View File

@ -0,0 +1,31 @@
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
class PreNet(dg.Layer):
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
"""
:param input_size: dimension of input
:param hidden_size: dimension of hidden unit
:param output_size: dimension of output
"""
super(PreNet, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_rate = dropout_rate
self.linear1 = dg.Linear(input_size, hidden_size)
self.linear2 = dg.Linear(hidden_size, output_size)
def forward(self, x):
"""
Pre Net before passing through the network.
Args:
x (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
x (Variable), Shape(B, T, C), the result after pernet.
"""
x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate)
x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate)
return x

73
parakeet/modules/utils.py Normal file
View File

@ -0,0 +1,73 @@
import numpy as np
import librosa
import os, copy
from scipy import signal
import paddle.fluid.layers as layers
def get_positional_table(d_pos_vec, n_position=1024):
position_enc = np.array([
[pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)]
if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
return position_enc
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
''' Sinusoid position encoding table '''
def cal_angle(position, hid_idx):
return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
if padding_idx is not None:
# zero vector for padding dimension
sinusoid_table[padding_idx] = 0.
return sinusoid_table
def get_non_pad_mask(seq):
return layers.unsqueeze((seq != 0).astype(np.float32),[-1])
def get_attn_key_pad_mask(seq_k, seq_q):
''' For masking out the padding part of key sequence. '''
# Expand to fit the shape of key query attention matrix.
len_q = seq_q.shape[1]
padding_mask = (seq_k != 0).astype(np.float32)
padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1])
return padding_mask
def get_triu_tensor(seq_k, seq_q):
''' For make a triu tensor '''
len_k = seq_k.shape[1]
len_q = seq_q.shape[1]
batch_size = seq_k.shape[0]
triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0)
return triu_tensor
def guided_attention(N, T, g=0.2):
'''Guided attention. Refer to page 3 on the paper.'''
W = np.zeros((N, T), dtype=np.float32)
for n_pos in range(W.shape[0]):
for t_pos in range(W.shape[1]):
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
return W
def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
input = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon)
label = input * (label * (position_weight - 1) + 1)
return layers.reduce_sum(label, dim=[0, 1])