Merge branch 'add_TranTTS' into 'master'
Add tran tts See merge request !1
This commit is contained in:
commit
ca5d57c1c7
|
@ -0,0 +1 @@
|
|||
from .audio import AudioProcessor
|
|
@ -0,0 +1,261 @@
|
|||
import librosa
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
import scipy.io
|
||||
import scipy.signal
|
||||
|
||||
class AudioProcessor(object):
    """Convert between waveforms and (mel) spectrograms.

    The processor bundles the STFT, mel filterbank, dB scaling and
    normalization settings so that feature extraction (``spectrogram``,
    ``melspectrogram``) and the Griffin-Lim based inversion
    (``inv_spectrogram``, ``inv_melspectrogram``) use the same parameters.

    Args:
        sample_rate (int): sampling rate of the audio.
        num_mels (int): number of bands of the mel spectrogram.
        min_level_db (float): minimum level in dB (floor of the dB scale).
        ref_level_db (float): reference level in dB, subtracted after the
            amplitude -> dB conversion.
        n_fft (int): number of samples in a frame for the STFT.
        win_length (int): window length in samples; defaults to ``n_fft``.
        hop_length (int): number of samples between neighboring frames;
            defaults to a quarter of ``win_length``.
        power (float): exponent applied to the magnitude before Griffin-Lim.
        preemphasis (float): preemphasis coefficient; ``None`` or ``0``
            disables preemphasis.
        signal_norm (bool): whether ``_normalize`` rescales spectrograms.
        symmetric_norm (bool): normalize into ``[-max_norm, max_norm]``
            instead of ``[0, max_norm]``.
        max_norm (float): bound of the normalized range; defaults to 1.0.
        mel_fmin (int): minimum frequency of the mel filterbank.
        mel_fmax (int): maximum frequency of the mel filterbank.
        clip_norm (bool): clip normalized values into the target range.
        griffin_lim_iters (int): number of Griffin-Lim iterations.
        do_trim_silence (bool): trim silence in ``load_wav``.
        sound_norm (bool): peak-normalize the waveform in ``load_wav``.
        **kwargs: accepted and ignored.
    """

    def __init__(self,
                 sample_rate=None,
                 num_mels=None,
                 min_level_db=None,
                 ref_level_db=None,
                 n_fft=None,
                 win_length=None,
                 hop_length=None,
                 power=None,
                 preemphasis=None,
                 signal_norm=None,
                 symmetric_norm=False,
                 max_norm=None,
                 mel_fmin=None,
                 mel_fmax=None,
                 clip_norm=True,
                 griffin_lim_iters=None,
                 do_trim_silence=False,
                 sound_norm=False,
                 **kwargs):
        self.sample_rate = sample_rate
        self.num_mels = num_mels
        self.min_level_db = min_level_db
        self.ref_level_db = ref_level_db

        # stft related
        self.n_fft = n_fft
        self.win_length = win_length or n_fft
        # hop length defaults to 1/4 of the window length; integer division
        # keeps it a sample count (a float hop would be rejected by the STFT)
        self.hop_length = hop_length or self.win_length // 4

        self.power = power
        # None means "no preemphasis"; the spectrogram methods skip the
        # filter when the coefficient is falsy
        self.preemphasis = 0.0 if preemphasis is None else float(preemphasis)

        self.griffin_lim_iters = griffin_lim_iters
        self.signal_norm = signal_norm
        self.symmetric_norm = symmetric_norm

        # mel transform related
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax

        self.max_norm = 1.0 if max_norm is None else float(max_norm)
        self.clip_norm = clip_norm
        self.do_trim_silence = do_trim_silence

        self.sound_norm = sound_norm
        (self.num_freq, self.frame_length_ms,
         self.frame_shift_ms) = self._stft_parameters()

    def _stft_parameters(self):
        """Return (num_freq, frame length in ms, frame shift in ms)."""
        # samples / (samples per second) is seconds; scale to milliseconds
        frame_length_ms = self.win_length * 1000. / self.sample_rate
        frame_shift_ms = self.hop_length * 1000. / self.sample_rate
        num_freq = 1 + self.n_fft // 2
        return num_freq, frame_length_ms, frame_shift_ms

    def __repr__(self):
        """Return a multi-line listing of every instance attribute."""
        cls_name_str = self.__class__.__name__
        members = vars(self)
        dict_str = "\n".join(
            [" {}: {},".format(k, v) for k, v in members.items()])
        repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
        return repr_str

    def save_wav(self, path, wav):
        """Peak-normalize ``wav`` and write it to ``path`` as 16-bit PCM."""
        # imported here so the submodule is guaranteed to be loaded even when
        # only ``scipy.io`` was imported at module level
        from scipy.io import wavfile

        # the 0.01 floor avoids blowing up (near-)silent signals
        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
        wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))

    def load_wav(self, path, sr=None):
        """Load a wav file, optionally trim silence and peak-normalize.

        Args:
            path (str): path of the audio file.
            sr (int, optional): sampling rate requested from librosa;
                ``None`` keeps the file's native rate.

        Returns:
            np.ndarray: the waveform.

        Raises:
            AssertionError: if the loaded sampling rate differs from
                ``self.sample_rate``.
        """
        x, loaded_sr = librosa.load(path, sr=sr)
        assert self.sample_rate == loaded_sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(loaded_sr, self.sample_rate)
        if self.do_trim_silence:
            try:
                x = self.trim_silence(x)
            except ValueError:
                print(" [!] File cannot be trimmed for silence - {}".format(path))
        if self.sound_norm:
            # normalize by the absolute peak so that a dominant negative
            # excursion cannot flip the sign or leave values outside [-0.9, 0.9]
            peak = np.abs(x).max() if x.size else 0.
            if peak > 0:
                x = x / peak * 0.9
        return x

    def trim_silence(self, wav):
        """Drop a 0.01s margin on both ends, then trim below-60dB silence."""
        margin = int(self.sample_rate * 0.01)
        # explicit end index: ``wav[0:-0]`` would be empty when margin is 0
        wav = wav[margin:len(wav) - margin]
        trimed_wav = librosa.effects.trim(
            wav,
            top_db=60,
            frame_length=self.win_length,
            hop_length=self.hop_length)[0]
        return trimed_wav

    def apply_preemphasis(self, x):
        """Apply the FIR filter ``y[n] = x[n] - preemphasis * x[n-1]``.

        Raises:
            RuntimeError: if the preemphasis coefficient is 0.
        """
        if self.preemphasis == 0.:
            raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)

    def apply_inv_preemphasis(self, x):
        """Undo ``apply_preemphasis`` with the inverse (IIR) filter.

        Raises:
            RuntimeError: if the preemphasis coefficient is 0.
        """
        if self.preemphasis == 0.:
            raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)

    def _amplitude_to_db(self, x):
        """Convert amplitudes to dB, flooring at ``min_level_db``."""
        # equals 10 ** (min_level_db / 20)
        amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
        return 20 * np.log10(np.maximum(amplitude_min, x))

    @staticmethod
    def _db_to_amplitude(x):
        """Convert dB values back to amplitudes."""
        return np.power(10., 0.05 * x)

    def _linear_to_mel(self, spectrogram):
        """Project a linear-frequency spectrogram onto the mel basis."""
        _mel_basis = self._build_mel_basis()
        return np.dot(_mel_basis, spectrogram)

    def _mel_to_linear(self, mel_spectrogram):
        """Approximate the linear spectrogram via the mel basis pseudo-inverse."""
        inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
        # floor at 1e-10 so that later log/power operations stay finite
        return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram))

    def _build_mel_basis(self):
        """Return the mel filterbank matrix from ``librosa.filters.mel``."""
        if self.mel_fmax is not None:
            assert self.mel_fmax <= self.sample_rate // 2
        # keyword arguments work with both old and keyword-only librosa APIs
        return librosa.filters.mel(
            sr=self.sample_rate,
            n_fft=self.n_fft,
            n_mels=self.num_mels,
            fmin=self.mel_fmin,
            fmax=self.mel_fmax)

    def _normalize(self, S):
        """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm].

        Returns ``S`` unchanged when ``signal_norm`` is falsy.
        """
        if self.signal_norm:
            S_norm = (S - self.min_level_db) / (-self.min_level_db)
            if self.symmetric_norm:
                S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
                if self.clip_norm:
                    S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
                return S_norm
            else:
                S_norm = self.max_norm * S_norm
                if self.clip_norm:
                    S_norm = np.clip(S_norm, 0, self.max_norm)
                return S_norm
        else:
            return S

    def _denormalize(self, S):
        """Invert ``_normalize`` (up to the effect of clipping)."""
        S_denorm = S
        if self.signal_norm:
            if self.symmetric_norm:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
                S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
                return S_denorm
            else:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
                S_denorm = S_denorm * (-self.min_level_db) / self.max_norm + self.min_level_db
                return S_denorm
        else:
            return S

    def _stft(self, y):
        """Short-time Fourier transform with the processor's frame settings."""
        return librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)

    def _istft(self, S):
        """Inverse STFT with the processor's hop and window length."""
        return librosa.istft(
            S, hop_length=self.hop_length, win_length=self.win_length)

    def spectrogram(self, y):
        """Compute the normalized linear spectrogram of a waveform.

        preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize
        """
        if self.preemphasis:
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
        S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db
        return self._normalize(S)

    def melspectrogram(self, y):
        """Compute the normalized mel spectrogram of a waveform.

        preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize
        """
        if self.preemphasis:
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
        S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
        return self._normalize(S)

    def inv_spectrogram(self, spectrogram):
        """Convert a normalized linear spectrogram back to a waveform (Griffin-Lim)."""
        S = self._denormalize(spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        if self.preemphasis:
            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
        return self._griffin_lim(S ** self.power)

    def inv_melspectrogram(self, mel_spectrogram):
        """Convert a normalized mel spectrogram back to a waveform (Griffin-Lim)."""
        S = self._denormalize(mel_spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        # go from the mel scale back to linear frequency bins before
        # Griffin-Lim; the STFT inverse needs ``num_freq`` rows
        S = self._mel_to_linear(S)
        if self.preemphasis:
            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
        return self._griffin_lim(S ** self.power)

    def out_linear_to_mel(self, linear_spec):
        """Convert a normalized linear spectrogram to a normalized mel spectrogram."""
        S = self._denormalize(linear_spec)
        S = self._db_to_amplitude(S + self.ref_level_db)
        S = self._linear_to_mel(np.abs(S))
        S = self._amplitude_to_db(S) - self.ref_level_db
        mel = self._normalize(S)
        return mel

    def _griffin_lim(self, S):
        """Estimate a waveform from a magnitude spectrogram.

        Starts from random phases and alternates istft / stft for
        ``griffin_lim_iters`` iterations, keeping the given magnitudes.
        """
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        # the builtin ``complex`` replaces the removed ``np.complex`` alias
        S_complex = np.abs(S).astype(complex)
        y = self._istft(S_complex * angles)
        for _ in range(self.griffin_lim_iters):
            angles = np.exp(1j * np.angle(self._stft(y)))
            y = self._istft(S_complex * angles)
        return y

    @staticmethod
    def mulaw_encode(wav, qc):
        """Mu-law compand ``wav`` and quantize it to ``2 ** qc`` levels."""
        mu = 2 ** qc - 1
        signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
        # Quantize signal to the specified number of levels.
        signal = (signal + 1) / 2 * mu + 0.5
        return np.floor(signal, )

    @staticmethod
    def mulaw_decode(wav, qc):
        """Recovers waveform from quantized values.

        NOTE(review): the formula treats ``wav`` as a signed companded value,
        while ``mulaw_encode`` returns levels in [0, mu] -- confirm the
        expected input range with callers before relying on a round trip.
        """
        mu = 2 ** qc - 1
        x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
        return x

    @staticmethod
    def encode_16bits(x):
        """Scale ``x`` by 2**15, clip to the int16 range and cast to int16."""
        return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)

    @staticmethod
    def quantize(x, bits):
        """Map values from [-1, 1] to [0, 2**bits - 1] (no rounding)."""
        return (x + 1.) * (2**bits - 1) / 2

    @staticmethod
    def dequantize(x, bits):
        """Map values from [0, 2**bits - 1] back to [-1, 1]."""
        return 2 * x / (2**bits - 1) - 1
|
|
@ -2,7 +2,8 @@ from .sampler import SequentialSampler, RandomSampler, BatchSampler
|
|||
|
||||
class DataCargo(object):
|
||||
def __init__(self, dataset, batch_size=1, sampler=None,
|
||||
shuffle=False, batch_sampler=None, drop_last=False):
|
||||
shuffle=False, batch_sampler=None, collate_fn=None,
|
||||
drop_last=False):
|
||||
self.dataset = dataset
|
||||
|
||||
if batch_sampler is not None:
|
||||
|
@ -21,13 +22,20 @@ class DataCargo(object):
|
|||
sampler = RandomSampler(dataset)
|
||||
else:
|
||||
sampler = SequentialSampler(dataset)
|
||||
# auto_collation without custom batch_sampler
|
||||
batch_sampler = BatchSampler(sampler, batch_size, drop_last)
|
||||
else:
|
||||
batch_sampler = BatchSampler(sampler, batch_size, drop_last)
|
||||
|
||||
self.batch_sampler = batch_sampler
|
||||
|
||||
if collate_fn is None:
|
||||
collate_fn = dataset._batch_examples
|
||||
self.collate_fn = collate_fn
|
||||
|
||||
self.batch_size = batch_size
|
||||
self.drop_last = drop_last
|
||||
self.sampler = sampler
|
||||
self.batch_sampler = batch_sampler
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
return DataIterator(self)
|
||||
|
@ -57,6 +65,7 @@ class DataIterator(object):
|
|||
|
||||
self._index_sampler = loader._index_sampler
|
||||
self._sampler_iter = iter(self._index_sampler)
|
||||
self.collate_fn = loader.collate_fn
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
@ -64,7 +73,7 @@ class DataIterator(object):
|
|||
def __next__(self):
|
||||
index = self._next_index() # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size
|
||||
minibatch = [self._dataset[i] for i in index] # we can abstract it, too to use dynamic batch size
|
||||
minibatch = self._dataset._batch_examples(minibatch) # list[Example] -> Batch
|
||||
minibatch = self.collate_fn(minibatch)
|
||||
return minibatch
|
||||
|
||||
def _next_index(self):
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import librosa
|
||||
|
||||
from paddle import fluid
|
||||
from parakeet import g2p
|
||||
from parakeet import audio
|
||||
from parakeet.data.sampler import *
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.dataset import Dataset
|
||||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||
|
||||
class LJSpeechLoader:
    """Build a fluid ``DataLoader`` that yields batches of LJSpeech examples.

    Args:
        config: configuration object; this class reads ``use_gpu``,
            ``data_path`` and ``batch_size`` from it and forwards it to
            ``LJSpeech``.
        nranks (int): number of ranks the global batch is divided across.
        rank (int): rank of this process; also used as the CUDA device index.
        is_vocoder (bool): collate with ``batch_examples_vocoder`` instead of
            ``batch_examples``.
        shuffle (bool): forwarded to the sampler and to ``DataCargo``.
    """

    def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
        if config.use_gpu:
            place = fluid.CUDAPlace(rank)
        else:
            place = fluid.CPUPlace()

        dataset = LJSpeech(Path(config.data_path), config)
        sampler = DistributedSampler(
            len(dataset), nranks, rank, shuffle=shuffle)

        # the global batch has to split evenly across ranks
        assert config.batch_size % nranks == 0
        each_bs = config.batch_size // nranks

        collate_fn = batch_examples_vocoder if is_vocoder else batch_examples
        dataloader = DataCargo(
            dataset,
            sampler=sampler,
            batch_size=each_bs,
            shuffle=shuffle,
            collate_fn=collate_fn,
            drop_last=True)

        self.reader = fluid.io.DataLoader.from_generator(
            capacity=32,
            iterable=True,
            use_double_buffer=True,
            return_list=True)
        self.reader.set_batch_generator(dataloader, place)
|
||||
|
||||
|
||||
class LJSpeech(Dataset):
    """LJSpeech dataset yielding ``(mag, mel, phonemes)`` examples.

    Args:
        root (str or Path): dataset directory; ``metadata.csv`` and the
            ``wavs`` sub-directory are looked up below it.
        config: configuration object whose ``audio`` section provides the
            audio processor settings.
    """

    def __init__(self, root, config):
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if not isinstance(root, Path):
            root = Path(root)
        self.root = root
        self.metadata = self._prepare_metadata()
        self.config = config

        audio_cfg = config.audio
        self._ljspeech_processor = audio.AudioProcessor(
            sample_rate=audio_cfg.sr,
            num_mels=audio_cfg.num_mels,
            min_level_db=audio_cfg.min_level_db,
            ref_level_db=audio_cfg.ref_level_db,
            n_fft=audio_cfg.n_fft,
            win_length=audio_cfg.win_length,
            hop_length=audio_cfg.hop_length,
            power=audio_cfg.power,
            preemphasis=audio_cfg.preemphasis,
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

    def _prepare_metadata(self):
        """Read ``metadata.csv`` ('|'-separated, no header) into a DataFrame."""
        columns = ["fname", "raw_text", "normalized_text"]
        # quoting=3 is passed through to pandas unchanged
        return pd.read_csv(
            self.root / "metadata.csv",
            sep="|",
            header=None,
            quoting=3,
            names=columns)

    def _get_example(self, metadatum):
        """Build one example from a metadata row.

        Override this method for a different preprocessing pipeline. When
        several heavily parameterized processors are needed, prefer passing a
        composed transform to the init method.

        Returns:
            tuple: ``(mag, mel, phonemes)`` -- float32 linear spectrogram,
            float32 mel spectrogram and an int64 array of symbol ids.
        """
        fname, _, normalized_text = metadatum
        wav_path = self.root / "wavs" / (fname + ".wav")

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        processor = self._ljspeech_processor
        wav = processor.load_wav(str(wav_path))
        mag = processor.spectrogram(wav).astype(np.float32)
        mel = processor.melspectrogram(wav).astype(np.float32)
        phonemes = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # maybe we need to implement it as a map in the future
        return (mag, mel, phonemes)

    def __getitem__(self, index):
        """Return the example built from metadata row ``index``."""
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        """Yield every example in metadata order."""
        for index in range(len(self)):
            yield self[index]

    def __len__(self):
        """Number of metadata rows."""
        return len(self.metadata)
|
||||
|
||||
|
||||
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples into padded training arrays.

    Examples are ordered by text length, longest first (ties keep their
    original order). The decoder input is the mel spectrogram shifted right
    by one frame with a zero frame in front.

    Args:
        batch (list): examples as ``(mag, mel, text)`` tuples; ``mag`` is
            ignored here.

    Returns:
        tuple: ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``.
    """
    records = []
    for _, mel, text in batch:
        go_frame = np.zeros([mel.shape[0], 1], np.float32)
        shifted_mel = np.concatenate([go_frame, mel[:, :-1]], axis=-1)
        records.append((
            len(text),
            text,
            mel,
            shifted_mel,
            np.arange(1, len(text) + 1),
            np.arange(1, mel.shape[1] + 1), ))

    # Sort by text_len in descending order (stable, keyed on length only)
    records.sort(key=lambda record: record[0], reverse=True)

    text_lens = [record[0] for record in records]
    texts = [record[1] for record in records]
    mels = [record[2] for record in records]
    mel_inputs = [record[3] for record in records]
    pos_texts = [record[4] for record in records]
    pos_mels = [record[5] for record in records]

    # Pad sequence with largest len of the batch
    texts = TextIDBatcher(pad_id=0)(texts)  #(B, T)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)  #(B,T)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)  #(B,T)
    mels = np.transpose(
        SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  #(B,T,num_mels)
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))  #(B,T,num_mels)
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
|
||||
|
||||
def batch_examples_vocoder(batch):
    """Collate ``(mag, mel, text)`` examples into padded ``(mels, mags)``.

    Both outputs come from ``SpecBatcher`` with the last two axes swapped;
    the text of each example is ignored.
    """
    mels = [mel for _, mel, _ in batch]
    mags = [mag for mag, _, _ in batch]

    padded_mels = SpecBatcher(pad_value=0.)(mels)
    padded_mags = SpecBatcher(pad_value=0.)(mags)

    return (np.transpose(padded_mels, axes=(0, 2, 1)),
            np.transpose(padded_mags, axes=(0, 2, 1)))
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
audio:
|
||||
num_mels: 80
|
||||
n_fft: 2048
|
||||
sr: 22050
|
||||
preemphasis: 0.97
|
||||
hop_length: 275
|
||||
win_length: 1102
|
||||
power: 1.2
|
||||
min_level_db: -100
|
||||
ref_level_db: 20
|
||||
outputs_per_step: 1
|
||||
|
||||
encoder_n_layer: 6
|
||||
encoder_head: 2
|
||||
encoder_conv1d_filter_size: 1536
|
||||
max_sep_len: 2048
|
||||
encoder_output_size: 384
|
||||
word_vec_dim: 384
|
||||
decoder_n_layer: 6
|
||||
decoder_head: 2
|
||||
decoder_conv1d_filter_size: 1536
|
||||
decoder_output_size: 384
|
||||
d_model: 384
|
||||
duration_predictor_output_size: 256
|
||||
duration_predictor_filter_size: 3
|
||||
fft_conv1d_filter: 3
|
||||
fft_conv1d_padding: 1
|
||||
|
||||
|
||||
batch_size: 32
|
||||
epochs: 10000
|
||||
lr: 0.001
|
||||
save_step: 500
|
||||
image_step: 2000
|
||||
use_gpu: False
|
||||
use_data_parallel: False
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
transtts_path: ./checkpoint
|
||||
transformer_step: 70000
|
||||
log_dir: ./log
|
|
@ -0,0 +1,43 @@
|
|||
audio:
|
||||
num_mels: 80
|
||||
n_fft: 2048
|
||||
sr: 22050
|
||||
preemphasis: 0.97
|
||||
hop_length: 275
|
||||
win_length: 1102
|
||||
power: 1.2
|
||||
min_level_db: -100
|
||||
ref_level_db: 20
|
||||
outputs_per_step: 1
|
||||
|
||||
encoder_n_layer: 6
|
||||
encoder_head: 2
|
||||
encoder_conv1d_filter_size: 1536
|
||||
max_sep_len: 2048
|
||||
encoder_output_size: 384
|
||||
embedding_size: 384
|
||||
decoder_n_layer: 6
|
||||
decoder_head: 2
|
||||
decoder_conv1d_filter_size: 1536
|
||||
decoder_output_size: 384
|
||||
hidden_size: 384
|
||||
duration_predictor_output_size: 256
|
||||
duration_predictor_filter_size: 3
|
||||
fft_conv1d_filter: 3
|
||||
fft_conv1d_padding: 1
|
||||
dropout: 0.1
|
||||
transformer_head: 4
|
||||
|
||||
warm_up_step: 4000
|
||||
grad_clip_thresh: 0.1
|
||||
batch_size: 32
|
||||
epochs: 10000
|
||||
lr: 0.001
|
||||
save_step: 500
|
||||
use_gpu: True
|
||||
use_data_parallel: False
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
transtts_path: ../transformerTTS/checkpoint
|
||||
transformer_step: 20
|
||||
log_dir: ./log
|
|
@ -0,0 +1,150 @@
|
|||
import numpy as np
|
||||
import math
|
||||
import utils
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.modules.layers import Conv1D
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||
|
||||
|
||||
|
||||
class FFTBlock(dg.Layer):
    """Feed Forward Transformer block: self-attention followed by a
    position-wise feed-forward network."""

    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 filter_size,
                 padding,
                 dropout=0.2):
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiheadAttention(
            d_model, d_k, d_v, num_head=n_head, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model,
            d_inner,
            filter_size=filter_size,
            padding=padding,
            dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """
        Feed Forward Transformer block in FastSpeech.

        Args:
            enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
                T means the timesteps of input.
            non_pad_mask (Variable, optional): Shape(B, T, 1), dtype: int64. The mask of sequence.
                When None, no padding mask is applied.
            slf_attn_mask (Variable, optional): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
                len_q means the sequence length of query, len_k means the sequence length of key.

        Returns:
            output (Variable), Shape(B, T, C), the output after self-attention & ffn.
            slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
        """
        output, slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        # the mask is optional (default None); multiplying by None would fail
        if non_pad_mask is not None:
            output *= non_pad_mask

        output = self.pos_ffn(output)
        if non_pad_mask is not None:
            output *= non_pad_mask

        return output, slf_attn
|
||||
|
||||
|
||||
class LengthRegulator(dg.Layer):
    """Expand encoder outputs along time according to per-token durations."""

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(
            input_size=input_size,
            out_channels=out_channels,
            filter_size=filter_size,
            dropout=dropout)

    def LR(self, x, duration_predictor_output, alpha=1.0):
        """Expand every sequence of the batch and pad them to a common length.

        Args:
            x (Variable): Shape(B, T, C), the encoder output.
            duration_predictor_output (Variable): Shape(B, T), the number of
                frames each input step is repeated.
            alpha (float): factor applied to every duration.

        Returns:
            Variable: the stacked, zero-padded expanded sequences.
        """
        expanded = [
            self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], alpha)
            for i in range(x.shape[0])
        ]
        return self.pad(expanded)

    def pad(self, input_ele):
        """Zero-pad each element along axis 0 to the longest one and stack."""
        max_len = max(ele.shape[0] for ele in input_ele)
        out_list = [
            layers.pad(
                ele, [0, max_len - ele.shape[0], 0, 0], pad_value=0.0)
            for ele in input_ele
        ]
        return layers.stack(out_list)

    def expand(self, batch, predicted, alpha):
        """Repeat each time step of one sequence by its (scaled) duration.

        Args:
            batch (Variable): Shape(1, T, C), a single sequence.
            predicted (Variable): Shape(1, T), the duration of each step.
            alpha (float): factor applied to every duration before rounding
                to the nearest integer.

        Returns:
            Variable: the expanded sequence, concatenated along axis 0.

        Raises:
            ValueError: if every scaled duration is zero, so there is
                nothing to concatenate.
        """
        out = []
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
        batch = layers.squeeze(batch, [0])

        for i in range(time_steps):
            # with alpha == 1.0 and integer durations this equals the raw
            # duration; other alpha values stretch or shrink the output
            repeat = int(np.round(fertilities[0, i] * alpha))
            if repeat <= 0:
                continue
            out.append(layers.expand(batch[i:i + 1, :], [repeat, 1]))
        if not out:
            raise ValueError(
                "all durations are zero, cannot expand the sequence")
        out = layers.concat(out, axis=0)
        return out

    def forward(self, x, alpha=1.0, target=None):
        """
        Length Regulator block in FastSpeech.

        Args:
            x (Variable): Shape(B, T, C), dtype: float32. The encoder output.
            alpha (Constant): dtype: float32. The hyperparameter to determine the length of
                the expanded sequence mel, thereby controlling the voice speed.
                Only applied outside train mode.
            target (Variable, optional): Shape(B, T_text),
                dtype: int64. The duration of phoneme compute from pretrained transformerTTS.
                Required in train mode.

        Returns:
            In train mode:
                output (Variable), the output after expand.
                duration_predictor_output (Variable), Shape(B, T), the output of duration predictor.
            Otherwise:
                output (Variable), the output after expand.
                mel_pos (Variable), Shape(T_mel,), dtype: int64, the positions 1..T_mel.

        Raises:
            ValueError: in train mode when ``target`` is None.
        """
        duration_predictor_output = self.duration_predictor(x)
        if fluid.framework._dygraph_tracer()._train_mode:
            if target is None:
                raise ValueError(
                    "target durations are required in train mode")
            output = self.LR(x, target)
            return output, duration_predictor_output
        else:
            duration_predictor_output = layers.round(duration_predictor_output)
            output = self.LR(x, duration_predictor_output, alpha)
            # to_variable expects an ndarray rather than a Python list
            mel_pos = dg.to_variable(
                np.arange(
                    1, output.shape[1] + 1, dtype=np.int64))
            return output, mel_pos
|
||||
|
||||
class DurationPredictor(dg.Layer):
    """Predict one non-negative duration per input time step.

    Two Conv1D -> LayerNorm -> ReLU -> dropout stages followed by a linear
    projection to a single channel.

    Args:
        input_size (int): number of input channels.
        out_channels (int): number of channels of both convolutions.
        filter_size (int): kernel size of both convolutions.
        dropout (float): dropout probability after each convolution stage.
    """

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(DurationPredictor, self).__init__()
        self.input_size = input_size
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.dropout = dropout

        # derive the padding from the kernel size (1 for the configured
        # filter_size of 3) so odd kernels keep the number of time steps
        padding = self.filter_size // 2

        self.conv1 = Conv1D(
            in_channels=self.input_size,
            out_channels=self.out_channels,
            filter_size=self.filter_size,
            padding=padding,
            data_format='NTC')
        self.conv2 = Conv1D(
            in_channels=self.out_channels,
            out_channels=self.out_channels,
            filter_size=self.filter_size,
            padding=padding,
            data_format='NTC')
        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)

        self.linear = dg.Linear(self.out_channels, 1)

    def forward(self, encoder_output):
        """
        Duration Predictor block in FastSpeech.

        Args:
            encoder_output (Variable): Shape(B, T, C), dtype: float32. The encoder output.
        Returns:
            out (Variable), Shape(B, T), the output of duration predictor
            (the trailing channel axis of size 1 is squeezed away).
        """
        # encoder_output.shape(N, T, C)
        out = layers.dropout(
            layers.relu(self.layer_norm1(self.conv1(encoder_output))),
            self.dropout)
        out = layers.dropout(
            layers.relu(self.layer_norm2(self.conv2(out))), self.dropout)
        # relu keeps the predicted durations non-negative
        out = layers.relu(self.linear(out))
        out = layers.squeeze(out, axes=[-1])

        return out
|
||||
|
||||
|
|
@ -0,0 +1,214 @@
|
|||
from utils import *
|
||||
from modules import *
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
from parakeet.modules.utils import *
|
||||
from parakeet.modules.post_convnet import PostConvNet
|
||||
|
||||
class Encoder(dg.Layer):
    """FastSpeech encoder: symbol embedding plus a fixed sinusoid position
    embedding, followed by a stack of ``FFTBlock`` layers."""

    def __init__(self,
                 n_src_vocab,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        super(Encoder, self).__init__()
        # one extra row: index 0 is the padding position
        n_position = len_max_seq + 1

        self.src_word_emb = dg.Embedding(
            size=[n_src_vocab, d_word_vec], padding_idx=0)
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_word_vec, padding_idx=0)
        # position table is initialized from the sinusoid values and frozen
        frozen_sinusoid = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.position_enc = dg.Embedding(
            size=[n_position, d_word_vec],
            padding_idx=0,
            param_attr=frozen_sinusoid)

        blocks = []
        for _ in range(n_layers):
            blocks.append(
                FFTBlock(
                    d_model,
                    d_inner,
                    n_head,
                    d_k,
                    d_v,
                    fft_conv1d_kernel,
                    fft_conv1d_padding,
                    dropout=dropout))
        self.layer_stack = blocks
        # a plain list is not tracked as sublayers, so register each block
        for idx, block in enumerate(blocks):
            self.add_sublayer('fft_{}'.format(idx), block)

    def forward(self, character, text_pos):
        """
        Encoder layer of FastSpeech.

        Args:
            character (Variable): Shape(B, T_text). The input text
                characters. T_text means the timesteps of input characters.
            text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
                position. T_text means the timesteps of input characters.

        Returns:
            enc_output (Variable), Shape(B, text_T, C), the encoder output.
            non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
            enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
        """
        # -- prepare masks (character has shape (N, T))
        slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
        non_pad_mask = get_non_pad_mask(character)

        # -- Forward
        word_emb = self.src_word_emb(character)
        pos_emb = self.position_enc(text_pos)
        enc_output = word_emb + pos_emb  #(N, T, C)

        enc_slf_attn_list = []
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
                enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            enc_slf_attn_list.append(enc_slf_attn)

        return enc_output, non_pad_mask, enc_slf_attn_list
|
||||
|
||||
class Decoder(dg.Layer):
    """FastSpeech decoder: adds a fixed sinusoid position embedding to its
    input and runs it through a stack of ``FFTBlock`` layers."""

    def __init__(self,
                 len_max_seq,
                 d_word_vec,
                 n_layers,
                 n_head,
                 d_k,
                 d_v,
                 d_model,
                 d_inner,
                 fft_conv1d_kernel,
                 fft_conv1d_padding,
                 dropout=0.1):
        super(Decoder, self).__init__()

        # one extra row: index 0 is the padding position
        n_position = len_max_seq + 1
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_word_vec, padding_idx=0)
        # position table is initialized from the sinusoid values and frozen
        frozen_sinusoid = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.position_enc = dg.Embedding(
            size=[n_position, d_word_vec],
            padding_idx=0,
            param_attr=frozen_sinusoid)

        blocks = []
        for _ in range(n_layers):
            blocks.append(
                FFTBlock(
                    d_model,
                    d_inner,
                    n_head,
                    d_k,
                    d_v,
                    fft_conv1d_kernel,
                    fft_conv1d_padding,
                    dropout=dropout))
        self.layer_stack = blocks
        # a plain list is not tracked as sublayers, so register each block
        for idx, block in enumerate(blocks):
            self.add_sublayer('fft_{}'.format(idx), block)

    def forward(self, enc_seq, enc_pos):
        """
        Decoder layer of FastSpeech.

        Args:
            enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
                The output of length regulator.
            enc_pos (Variable, optional): Shape(B, T_mel),
                dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
        Returns:
            dec_output (Variable), Shape(B, mel_T, C), the decoder output.
            dec_slf_attn_list (list<Variable>), one self attention per layer.
        """
        # -- Prepare masks
        slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
        non_pad_mask = get_non_pad_mask(enc_pos)

        # -- Forward
        pos_emb = self.position_enc(enc_pos)
        dec_output = enc_seq + pos_emb

        dec_slf_attn_list = []
        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn = dec_layer(
                dec_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list.append(dec_slf_attn)

        return dec_output, dec_slf_attn_list
|
||||
|
||||
class FastSpeech(dg.Layer):
    """FastSpeech acoustic model.

    Chains an encoder, a length regulator (which also hosts the duration
    predictor), a decoder, a linear projection to mel channels and a
    residual postnet.
    """

    def __init__(self, cfg):
        """Build all sub-networks from the parsed configuration ``cfg``.

        Args:
            cfg: configuration namespace. The fields read here are
                ``max_sep_len``, ``embedding_size``, ``hidden_size``,
                ``encoder_n_layer``, ``encoder_head``,
                ``encoder_conv1d_filter_size``, ``decoder_n_layer``,
                ``decoder_head``, ``decoder_conv1d_filter_size``,
                ``decoder_output_size``, ``fft_conv1d_filter``,
                ``fft_conv1d_padding``, ``duration_predictor_output_size``,
                ``duration_predictor_filter_size``, ``dropout`` and
                ``audio.num_mels``.
        """
        super(FastSpeech, self).__init__()

        self.encoder = Encoder(n_src_vocab=len(symbols)+1,
                               len_max_seq=cfg.max_sep_len,
                               d_word_vec=cfg.embedding_size,
                               n_layers=cfg.encoder_n_layer,
                               n_head=cfg.encoder_head,
                               d_k=64,
                               d_v=64,
                               d_model=cfg.hidden_size,
                               d_inner=cfg.encoder_conv1d_filter_size,
                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
                               fft_conv1d_padding=cfg.fft_conv1d_padding,
                               dropout=0.1)
        self.length_regulator = LengthRegulator(input_size=cfg.hidden_size,
                                                out_channels=cfg.duration_predictor_output_size,
                                                filter_size=cfg.duration_predictor_filter_size,
                                                dropout=cfg.dropout)
        self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
                               d_word_vec=cfg.embedding_size,
                               n_layers=cfg.decoder_n_layer,
                               n_head=cfg.decoder_head,
                               d_k=64,
                               d_v=64,
                               d_model=cfg.hidden_size,
                               d_inner=cfg.decoder_conv1d_filter_size,
                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
                               fft_conv1d_padding=cfg.fft_conv1d_padding,
                               dropout=0.1)
        self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
        # The postnet output is added to ``mel_output`` in forward(), so its
        # mel channel count has to follow the projection above instead of a
        # hard-coded 80 (identical for the default configuration).
        self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
                                   num_hidden=512,
                                   filter_size=5,
                                   padding=int(5 / 2),
                                   num_conv=5,
                                   outputs_per_step=1,
                                   use_cudnn=True,
                                   dropout=0.1)

    def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
        """
        FastSpeech model.

        Args:
            character (Variable): Shape(B, T_text). The input text characters
                (token ids -- presumably an integer dtype, confirm with the
                data loader). T_text means the timesteps of input characters.
            text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
                position. T_text means the timesteps of input characters.
            mel_pos (Variable, optional): Shape(B, T_mel),
                dtype: int64. The spectrum position. T_mel means the timesteps of
                input spectrum. Only used in training mode.
            length_target (Variable, optional): Shape(B, T_text). The duration
                of each phoneme computed from a pretrained transformerTTS.
                Only used in training mode.
            alpha (Constant):
                dtype: float32. The hyperparameter to determine the length of the
                expanded sequence mel, thereby controlling the voice speed.

        Returns:
            In training mode a 5-tuple:
                mel_output (Variable), Shape(B, mel_T, C), the mel output before postnet.
                mel_output_postnet (Variable), Shape(B, mel_T, C), the mel output after postnet.
                duration_predictor_output (Variable), Shape(B, text_T), the duration of
                    phoneme computed with the duration predictor.
                enc_slf_attn_list (list), the encoder self attention list.
                dec_slf_attn_list (list), the decoder self attention list.
            In evaluation mode a 2-tuple ``(mel_output, mel_output_postnet)``.
        """
        encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
        if fluid.framework._dygraph_tracer()._train_mode:
            # Training: expand with the ground-truth durations and also
            # return the predictor output so the caller can compute its loss.
            length_regulator_output, duration_predictor_output = self.length_regulator(
                encoder_output,
                target=length_target,
                alpha=alpha)
            decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)

            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output

            return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
        else:
            # Inference: the length regulator expands with predicted
            # durations and returns the matching decoder positions.
            length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
            # The decoder returns (output, attention_list); unpack it so the
            # linear projection receives the tensor, not the tuple.
            decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)

            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output

            return mel_output, mel_output_postnet
|
|
@ -0,0 +1,93 @@
|
|||
import jsonargparse
|
||||
|
||||
def add_config_options_to_parser(parser):
    """Register every FastSpeech option on ``parser``.

    Each option is added with ``parser.add_argument(flag, type=..., default=...,
    help=...)`` in the order listed below, followed by the ``-c/--config``
    option that uses ``jsonargparse.ActionConfigFile``.
    """
    # (flag, type, default, help) -- order is the registration order.
    option_table = (
        # audio front-end
        ('--audio.num_mels', int, 80,
         "the number of mel bands when calculating mel spectrograms."),
        ('--audio.n_fft', int, 2048,
         "the number of fft components."),
        ('--audio.sr', int, 22050,
         "the sampling rate of audio data file."),
        ('--audio.preemphasis', float, 0.97,
         "the preemphasis coefficient."),
        ('--audio.hop_length', int, 128,
         "the number of samples to advance between frames."),
        ('--audio.win_length', int, 1024,
         "the length (width) of the window function."),
        ('--audio.power', float, 1.4,
         "the power to raise before griffin-lim."),
        ('--audio.min_level_db', int, -100,
         "the minimum level db."),
        ('--audio.ref_level_db', int, 20,
         "the reference level db."),
        ('--audio.outputs_per_step', int, 1,
         "the outputs per step."),
        # model sizes
        ('--embedding_size', int, 256,
         "the dim size of embedding."),
        ('--encoder_n_layer', int, 6,
         "the number of FFT Block in encoder."),
        ('--encoder_head', int, 2,
         "the attention head number in encoder."),
        ('--encoder_conv1d_filter_size', int, 1024,
         "the filter size of conv1d in encoder."),
        ('--max_sep_len', int, 2048,
         "the max length of sequence."),
        ('--encoder_output_size', int, 256,
         "the output channel size of encoder."),
        ('--decoder_n_layer', int, 6,
         "the number of FFT Block in decoder."),
        ('--decoder_head', int, 2,
         "the attention head number in decoder."),
        ('--decoder_conv1d_filter_size', int, 1024,
         "the filter size of conv1d in decoder."),
        ('--decoder_output_size', int, 256,
         "the output channel size of decoder."),
        ('--hidden_size', int, 256,
         "the hidden size in model."),
        ('--duration_predictor_output_size', int, 256,
         "the output size of duration predictior."),
        ('--duration_predictor_filter_size', int, 3,
         "the filter size of conv1d in duration prediction."),
        ('--fft_conv1d_filter', int, 3,
         "the filter size of conv1d in fft."),
        ('--fft_conv1d_padding', int, 1,
         "the padding size of conv1d in fft."),
        ('--dropout', float, 0.1,
         "the dropout in network."),
        ('--transformer_head', int, 4,
         "the attention head num of transformerTTS."),
        # optimisation / training loop
        ('--warm_up_step', int, 4000,
         "the warm up step of learning rate."),
        ('--grad_clip_thresh', float, 1.0,
         "the threshold of grad clip."),
        ('--batch_size', int, 32,
         "batch size for training."),
        ('--epochs', int, 10000,
         "the number of epoch for training."),
        ('--lr', float, 0.001,
         "the learning rate for training."),
        ('--save_step', int, 500,
         "checkpointing interval during training."),
        ('--use_gpu', bool, True,
         "use gpu or not during training."),
        ('--use_data_parallel', bool, False,
         "use data parallel or not during training."),
        # paths
        ('--data_path', str, './dataset/LJSpeech-1.1',
         "the path of dataset."),
        ('--checkpoint_path', str, None,
         "the path to load checkpoint or pretrain model."),
        ('--save_path', str, './checkpoint',
         "the path to save checkpoint."),
        ('--log_dir', str, './log',
         "the directory to save tensorboard log."),
        ('--sample_path', str, './sample',
         "the directory to save audio sample in synthesis."),
        ('--transtts_path', str, './log',
         "the directory to load pretrain transformerTTS model."),
        ('--transformer_step', int, 70000,
         "the step to load transformerTTS model."),
    )

    for flag, value_type, default_value, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=default_value,
                            help=help_text)

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
|
|
@ -0,0 +1,139 @@
|
|||
import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
import jsonargparse
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
from tensorboardX import SummaryWriter
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
from parse import add_config_options_to_parser
|
||||
from pprint import pprint
|
||||
from network import FastSpeech
|
||||
from utils import get_alignment
|
||||
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
|
||||
from parakeet.models.transformerTTS.network import TransformerTTS
|
||||
|
||||
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that the wrapper itself cannot satisfy are forwarded to
    the wrapped model stored in ``_sub_layers["_layers"]``, so the proxy can be
    used wherever the bare model is expected.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the proxy first, then on the wrapped model."""
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Compare by value: ``is`` on a string literal only works by accident
        # of interning and raises a SyntaxWarning on Python 3.8+.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)
|
||||
|
||||
def main(cfg):
    """Train FastSpeech, using a pretrained TransformerTTS for durations.

    For every batch the TransformerTTS teacher is run to obtain attention
    probabilities, ``get_alignment`` turns them into per-character durations,
    and FastSpeech is trained on the sum of the mel MSE loss, the post-net mel
    MSE loss and the mean absolute duration error.

    Args:
        cfg: parsed configuration namespace (see
            ``add_config_options_to_parser``).
    """
    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids the check-then-create race between data-parallel
    # workers and also copes with nested log directories.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir,'fastspeech')

    # Only the first worker writes tensorboard summaries.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        # Teacher model: loaded from <transtts_path>/transformer/<step> and
        # switched to eval mode.
        transformerTTS = TransformerTTS(cfg)
        model_path = os.path.join(cfg.transtts_path, "transformer")
        model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))

        transformerTTS.set_dict(model_dict)
        transformerTTS.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
                                                  parameter_list=model.parameters())
        reader = LJSpeechLoader(cfg, nranks, local_rank).reader()

        if cfg.checkpoint_path is not None:
            model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            print("load checkpoint!!!")

        if cfg.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = MyDataParallel(model, strategy)

        for epoch in range(cfg.epochs):
            pbar = tqdm(reader)

            for data in pbar:
                pbar.set_description('Processing at epoch %d'%epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length = data

                # Teacher forward pass: only the attention probabilities are
                # needed, to derive the duration targets.
                _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
                alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)

                global_step += 1

                #Forward
                result= model(character,
                              pos_text,
                              mel_pos=pos_mel,
                              length_target=alignment)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank==0:
                    print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))

                    writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)

                if cfg.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
                    total_loss.backward()
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
                model.clear_gradients()

                # save checkpoint
                if local_rank==0 and global_step % cfg.save_step == 0:
                    os.makedirs(cfg.save_path, exist_ok=True)
                    save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
        if local_rank==0:
            writer.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The configuration file is hard-wired: the argument list passed to
    # parse_args() replaces whatever was given on the command line.
    cli_parser = jsonargparse.ArgumentParser(
        description="Train Fastspeech model",
        formatter_class='default_argparse')
    add_config_options_to_parser(cli_parser)
    fixed_argv = ['-c', 'config/fastspeech.yaml']
    main(cli_parser.parse_args(fixed_argv))
|
|
@ -0,0 +1,32 @@
|
|||
import numpy as np
|
||||
|
||||
def get_alignment(attn_probs, n_head):
    """Derive per-character durations from the most focused attention head.

    Every entry of ``attn_probs`` is converted with ``.numpy()`` and split
    along axis 0 into ``n_head`` consecutive chunks of ``batch_size`` rows.
    Each chunk is scored with ``score_F``; the chunk with the highest score
    (strictly greater than 0) is handed to ``compute_duration``.

    Args:
        attn_probs: sequence of attention tensors exposing ``.shape`` and
            ``.numpy()``; axis 0 has ``n_head * batch_size`` rows.
        n_head (int): number of attention heads stacked along axis 0.

    Returns:
        numpy.ndarray: the result of ``compute_duration`` on the best head.

    Raises:
        ValueError: if no head has a score above 0 (for example when the
            attention contains NaN), so no alignment can be selected.
    """
    max_F = 0
    max_attn = None
    assert attn_probs[0].shape[0] % n_head == 0
    batch_size = int(attn_probs[0].shape[0] // n_head)
    for layer_attn in attn_probs:
        multi_attn = layer_attn.numpy()
        for j in range(n_head):
            attn = multi_attn[j*batch_size:(j+1)*batch_size]
            F = score_F(attn)
            if max_F < F:
                max_F = F
                max_attn = attn
    if max_attn is None:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            "no attention head has a positive focus score; "
            "cannot derive an alignment")
    alignment = compute_duration(max_attn)
    return alignment
|
||||
|
||||
def score_F(attn):
    """Return the mean, over all leading axes, of the maximum along the last axis."""
    peak_per_row = np.max(attn, axis=-1)
    return np.mean(peak_per_row)
|
||||
|
||||
def compute_duration(attn):
    """Count, per batch item, how many rows of ``attn`` peak at each column.

    Args:
        attn (numpy.ndarray): 3-D array indexed as ``attn[i, j, k]``. For each
            ``(i, j)`` the index ``k`` of the maximum along the last axis is
            taken (first occurrence on ties).

    Returns:
        numpy.ndarray: float array of shape ``(attn.shape[0], attn.shape[2])``
        where entry ``[i, k]`` is the number of rows ``j`` whose maximum is at
        column ``k``.
    """
    attn = np.asarray(attn)
    n_items, n_rows, n_cols = attn.shape
    alignment = np.zeros([n_items, n_cols])
    if n_rows == 0:
        # Nothing to count; mirrors the original loop doing no work.
        return alignment
    # argmax returns the first maximal index, matching list.index(max).
    max_index = np.argmax(attn, axis=-1)
    for i in range(n_items):
        alignment[i] = np.bincount(max_index[i], minlength=n_cols)

    return alignment
|
||||
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
audio:
|
||||
num_mels: 80
|
||||
n_fft: 2048
|
||||
sr: 22050
|
||||
preemphasis: 0.97
|
||||
hop_length: 275
|
||||
win_length: 1102
|
||||
power: 1.2
|
||||
min_level_db: -100
|
||||
ref_level_db: 20
|
||||
outputs_per_step: 1
|
||||
|
||||
max_len: 50
|
||||
transformer_step: 1
|
||||
postnet_step: 1
|
||||
use_gpu: True
|
||||
|
||||
checkpoint_path: ./checkpoint
|
||||
log_dir: ./log
|
||||
sample_path: ./sample
|
|
@ -0,0 +1,27 @@
|
|||
audio:
|
||||
num_mels: 80
|
||||
n_fft: 2048
|
||||
sr: 22050
|
||||
preemphasis: 0.97
|
||||
hop_length: 275
|
||||
win_length: 1102
|
||||
power: 1.2
|
||||
min_level_db: -100
|
||||
ref_level_db: 20
|
||||
outputs_per_step: 1
|
||||
|
||||
hidden_size: 256
|
||||
embedding_size: 512
|
||||
|
||||
warm_up_step: 4000
|
||||
grad_clip_thresh: 1.0
|
||||
batch_size: 32
|
||||
epochs: 10000
|
||||
lr: 0.001
|
||||
save_step: 500
|
||||
use_gpu: True
|
||||
use_data_parallel: True
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
save_path: ./checkpoint
|
||||
log_dir: ./log
|
|
@ -0,0 +1,33 @@
|
|||
audio:
|
||||
num_mels: 80
|
||||
n_fft: 2048
|
||||
sr: 22050
|
||||
preemphasis: 0.97
|
||||
hop_length: 275
|
||||
win_length: 1102
|
||||
power: 1.2
|
||||
min_level_db: -100
|
||||
ref_level_db: 20
|
||||
outputs_per_step: 1
|
||||
|
||||
|
||||
hidden_size: 256
|
||||
embedding_size: 512
|
||||
|
||||
|
||||
warm_up_step: 4000
|
||||
grad_clip_thresh: 1.0
|
||||
batch_size: 32
|
||||
epochs: 10000
|
||||
lr: 0.001
|
||||
save_step: 1000
|
||||
image_step: 2000
|
||||
use_gpu: True
|
||||
use_data_parallel: False
|
||||
|
||||
data_path: ../../../dataset/LJSpeech-1.1
|
||||
save_path: ./checkpoint
|
||||
log_dir: ./log
|
||||
#checkpoint_path: ./checkpoint/transformer/1
|
||||
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
import math
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
|
||||
class Conv1D(dg.Layer):
    """
    1-D convolution implemented on top of ``dg.Conv2D``.

    The input is given a singleton height axis, convolved with a
    ``(1, filter_size)`` kernel, and the singleton axis is removed again.
    """

    def __init__(self,
                 in_channels,
                 num_filters,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(dtype=dtype)

        # Keep the hyper-parameters around for inspection.
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # Every 1-D setting becomes the width component of a 2-D setting;
        # the height component is the identity (size 1 / no padding).
        self.conv = dg.Conv2D(in_channels=in_channels,
                              num_filters=num_filters,
                              filter_size=(1, filter_size),
                              stride=(1, stride),
                              dilation=(1, dilation),
                              padding=(0, padding),
                              groups=groups,
                              param_attr=param_attr,
                              bias_attr=bias_attr,
                              use_cudnn=use_cudnn,
                              act=act,
                              dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. Channel-first ``(B, C_in, T)`` unless
                ``data_format`` is ``'NTC'``, in which case the last two
                axes are swapped before and after the convolution.
        Returns:
            x (Variable): the convolved tensor in the same layout as the
                input, with ``num_filters`` channels.
        """
        time_major = self.data_format == 'NTC'
        if time_major:
            x = fluid.layers.transpose(x, [0, 2, 1])

        expanded = fluid.layers.unsqueeze(x, [2])
        convolved = self.conv(expanded)
        x = fluid.layers.squeeze(convolved, [2])

        if time_major:
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
||||
|
||||
class Pool1D(dg.Layer):
    """
    1-D pooling implemented on top of ``dg.Pool2D``.

    The input is given a singleton height axis, pooled with a
    ``[1, pool_size]`` window, and the singleton axis is removed again.
    """

    def __init__(self,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT',
                 dtype='float32'):
        super(Pool1D, self).__init__(dtype=dtype)

        # Keep the hyper-parameters around for inspection.
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format
        self.dtype = dtype

        # 1-D settings become the width component of the 2-D settings.
        self.pool2d = dg.Pool2D([1, pool_size],
                                pool_type=pool_type,
                                pool_stride=[1, pool_stride],
                                pool_padding=[0, pool_padding],
                                global_pooling=global_pooling,
                                use_cudnn=use_cudnn,
                                ceil_mode=ceil_mode,
                                exclusive=exclusive,
                                dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input. Channel-first ``(B, C, T)`` unless
                ``data_format`` is ``'NTC'``, in which case the last two
                axes are swapped before and after pooling.
        Returns:
            x (Variable): the pooled tensor in the same layout as the input.
        """
        time_major = self.data_format == 'NTC'
        if time_major:
            x = fluid.layers.transpose(x, [0, 2, 1])

        expanded = fluid.layers.unsqueeze(x, [2])
        pooled = self.pool2d(expanded)
        x = fluid.layers.squeeze(pooled, [2])

        if time_major:
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
||||
|
||||
class DynamicGRU(dg.Layer):
    """Unrolls a ``dg.GRUUnit`` over axis 1 of the input, optionally backwards."""

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        # GRUUnit is sized as three times the hidden size.
        self.gru_unit = dg.GRUUnit(size * 3,
                                   param_attr=param_attr,
                                   bias_attr=bias_attr,
                                   activation=candidate_activation,
                                   gate_activation=gate_activation,
                                   origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """Run the GRU step by step and return the stacked hidden states.

        Args:
            inputs (Variable): 3-D tensor; axis 1 is iterated one slice at a
                time and axis 2 is fed to the GRU unit.
        Returns:
            Variable: the hidden state of every step concatenated along
            axis 1, in the same step order as ``inputs``.
        """
        n_steps = inputs.shape[1]
        if self.is_reverse:
            visit_order = range(n_steps - 1, -1, -1)
        else:
            visit_order = range(n_steps)

        state = self.h_0
        collected = []
        for t in visit_order:
            frame = inputs[:, t:t + 1, :]
            frame = fluid.layers.reshape(
                frame, [-1, frame.shape[2]], inplace=False)
            state, reset, gate = self.gru_unit(frame, state)
            collected.append(fluid.layers.reshape(
                state, [-1, 1, state.shape[1]], inplace=False))

        # States were produced last-to-first when running backwards; put
        # them back in input order before stacking.
        if self.is_reverse:
            collected = collected[::-1]
        return fluid.layers.concat(collected, axis=1)
|
||||
|
|
@ -0,0 +1,242 @@
|
|||
import math
|
||||
from parakeet.g2p.text.symbols import symbols
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.layers import Conv, Pool1D
|
||||
from parakeet.modules.dynamicGRU import DynamicGRU
|
||||
import numpy as np
|
||||
|
||||
|
||||
|
||||
class EncoderPrenet(dg.Layer):
    """Character embedding followed by three conv/batch-norm stages and a linear projection."""

    def __init__(self, embedding_size, num_hidden, use_cudnn=True):
        super(EncoderPrenet, self).__init__()
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.embedding = dg.Embedding(size=[len(symbols), embedding_size],
                                      param_attr=fluid.ParamAttr(name='weight'),
                                      padding_idx=None)

        # Three kernel-5 convolutions; only the first one changes the
        # channel count (embedding_size -> num_hidden).
        self.conv_list = []
        for in_ch in (embedding_size, num_hidden, num_hidden):
            self.conv_list.append(Conv(in_channels=in_ch,
                                       out_channels=num_hidden,
                                       filter_size=5,
                                       padding=int(np.floor(5/2)),
                                       use_cudnn=use_cudnn,
                                       data_format="NCT"))
        for idx, conv in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(idx), conv)

        # One batch norm per convolution.
        self.batch_norm_list = []
        for _ in range(3):
            self.batch_norm_list.append(
                dg.BatchNorm(num_hidden,
                             param_attr=fluid.ParamAttr(name='weight'),
                             bias_attr=fluid.ParamAttr(name='bias'),
                             moving_mean_name='moving_mean',
                             moving_variance_name='moving_var',
                             data_layout='NCHW'))
        for idx, norm in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(idx), norm)

        self.projection = dg.Linear(num_hidden, num_hidden)

    def forward(self, x):
        """Embed ``x`` and run it through the conv stack and the projection.

        The embedded tensor is transposed to channel-first for the
        convolutions and transposed back before the final linear layer.
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_size)
        hidden = layers.transpose(embedded, [0, 2, 1])
        for norm, conv in zip(self.batch_norm_list, self.conv_list):
            activated = layers.relu(norm(conv(hidden)))
            hidden = layers.dropout(activated, 0.2)
        hidden = layers.transpose(hidden, [0, 2, 1])  # (N,T,C)
        return self.projection(hidden)
|
||||
|
||||
class CBHG(dg.Layer):
    # CBHG block: a bank of K chained convolutions, max pooling, two
    # projection convolutions with a residual connection, a highway network
    # and two stacked bidirectional GRU layers (see forward()).
    # NOTE(review): ``num_gru_layers`` and ``is_post`` are accepted but not
    # referenced anywhere in this class body.
    def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
                 max_pool_kernel_size=2, is_post=False):
        super(CBHG, self).__init__()
        """
        :param hidden_size: dimension of hidden unit
        :param K: # of convolution banks
        :param projection_size: dimension of projection unit
        :param num_gru_layers: # of layers of GRUcell
        :param max_pool_kernel_size: max pooling kernel size
        :param is_post: whether post processing or not
        """
        self.hidden_size = hidden_size
        self.projection_size = projection_size
        # Convolution bank: filter sizes 1..K. The first layer maps
        # projection_size -> hidden_size, the rest keep hidden_size.
        self.conv_list = []
        self.conv_list.append(Conv(in_channels = projection_size,
                                   out_channels = hidden_size,
                                   filter_size = 1,
                                   padding = int(np.floor(1/2)),
                                   data_format = "NCT"))
        for i in range(2,K+1):
            self.conv_list.append(Conv(in_channels = hidden_size,
                                       out_channels = hidden_size,
                                       filter_size = i,
                                       padding = int(np.floor(i/2)),
                                       data_format = "NCT"))

        # Plain Python lists are not tracked as sub-layers, so each layer is
        # registered explicitly.
        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        # One batch norm per bank convolution.
        self.batchnorm_list = []
        for i in range(K):
            self.batchnorm_list.append(dg.BatchNorm(hidden_size,
                                                    param_attr = fluid.ParamAttr(name='weight'),
                                                    bias_attr = fluid.ParamAttr(name='bias'),
                                                    moving_mean_name = 'moving_mean',
                                                    moving_variance_name = 'moving_var',
                                                    data_layout='NCHW'))

        for i, layer in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(i), layer)

        # Channel count after concatenating the K bank outputs in forward().
        conv_outdim = hidden_size * K

        self.conv_projection_1 = Conv(in_channels = conv_outdim,
                                      out_channels = hidden_size,
                                      filter_size = 3,
                                      padding = int(np.floor(3/2)),
                                      data_format = "NCT")

        self.conv_projection_2 = Conv(in_channels = hidden_size,
                                      out_channels = projection_size,
                                      filter_size = 3,
                                      padding = int(np.floor(3/2)),
                                      data_format = "NCT")

        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
                                             param_attr = fluid.ParamAttr(name='weight'),
                                             bias_attr = fluid.ParamAttr(name='bias'),
                                             moving_mean_name = 'moving_mean',
                                             moving_variance_name = 'moving_var',
                                             data_layout='NCHW')
        self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
                                             param_attr = fluid.ParamAttr(name='weight'),
                                             bias_attr = fluid.ParamAttr(name='bias'),
                                             moving_mean_name = 'moving_mean',
                                             moving_variance_name = 'moving_var',
                                             data_layout='NCHW')
        self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
                               pool_type='max',
                               pool_stride=1,
                               pool_padding=1,
                               data_format = "NCT")
        self.highway = Highwaynet(self.projection_size)

        # Zero initial hidden state, built once for a fixed ``batch_size`` and
        # shared by all four GRUs below.
        # NOTE(review): presumably inputs with a different batch size will not
        # match this state -- confirm against DynamicGRU / GRUUnit.
        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
        h_0 = dg.to_variable(h_0)
        # The linear layers produce the 3 * (hidden_size // 2) wide input that
        # each DynamicGRU of size hidden_size // 2 consumes.
        self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
        self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
        self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
                                       param_attr = fluid.ParamAttr(name='weight'),
                                       bias_attr = fluid.ParamAttr(name='bias'),
                                       is_reverse = False,
                                       origin_mode = True,
                                       h_0 = h_0)
        self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
                                       param_attr = fluid.ParamAttr(name='weight'),
                                       bias_attr = fluid.ParamAttr(name='bias'),
                                       is_reverse=True,
                                       origin_mode=True,
                                       h_0 = h_0)

        self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
        self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
        self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
                                       param_attr = fluid.ParamAttr(name='weight'),
                                       bias_attr = fluid.ParamAttr(name='bias'),
                                       is_reverse = False,
                                       origin_mode = True,
                                       h_0 = h_0)
        self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
                                       param_attr = fluid.ParamAttr(name='weight'),
                                       bias_attr = fluid.ParamAttr(name='bias'),
                                       is_reverse=True,
                                       origin_mode=True,
                                       h_0 = h_0)

    def _conv_fit_dim(self, x, filter_size=3):
        # Drop the last element of the last axis when the filter size is even.
        # NOTE(review): presumably an even kernel with padding floor(k/2)
        # yields one extra frame, which this trims -- confirm against Conv.
        if filter_size % 2 == 0:
            return x[:,:,:-1]
        else:
            return x

    def forward(self, input_):
        # input_.shape = [N, C, T]

        conv_list = []
        conv_input = input_

        # The bank is applied as a chain: each convolution consumes the
        # previous stage's activated output, and every stage's output is kept.
        # ``i+1`` is the filter size of the i-th convolution (1..K).
        for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
            conv_input = self._conv_fit_dim(conv(conv_input), i+1)
            conv_input = layers.relu(batchnorm(conv_input))
            conv_list.append(conv_input)

        # Concatenate the stage outputs on the channel axis, pool, and drop
        # the last frame of the pooled result.
        conv_cat = layers.concat(conv_list, axis=1)
        conv_pool = self.max_pool(conv_cat)[:,:,:-1]


        conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
        # Residual connection back to the block input.
        conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_

        # conv_proj.shape = [N, C, T]
        highway = layers.transpose(conv_proj, [0,2,1])
        highway = self.highway(highway)

        # highway.shape = [N, T, C]
        # First bidirectional GRU layer: forward and reverse outputs are
        # concatenated on the last axis.
        fc_forward = self.fc_forward1(highway)
        fc_reverse = self.fc_reverse1(highway)
        out_forward = self.gru_forward1(fc_forward)
        out_reverse = self.gru_reverse1(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        # Second bidirectional GRU layer.
        fc_forward = self.fc_forward2(out)
        fc_reverse = self.fc_reverse2(out)
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        # Swap the last two axes back before returning.
        out = layers.transpose(out, [0,2,1])
        return out
|
||||
|
||||
class Highwaynet(dg.Layer):
    """Stack of highway layers: ``out = relu(linear(x)) * t + x * (1 - t)`` with ``t = sigmoid(gate(x))``."""

    def __init__(self, num_units, num_layers=4):
        super(Highwaynet, self).__init__()
        self.num_units = num_units
        self.num_layers = num_layers

        self.gates = []
        self.linears = []

        # Create each (linear, gate) pair and register it right away; plain
        # Python lists are not tracked as sub-layers on their own.
        for idx in range(num_layers):
            transform = dg.Linear(num_units, num_units)
            gate = dg.Linear(num_units, num_units)
            self.linears.append(transform)
            self.gates.append(gate)
        for idx in range(len(self.linears)):
            self.add_sublayer("linears_{}".format(idx), self.linears[idx])
            self.add_sublayer("gates_{}".format(idx), self.gates[idx])

    def forward(self, input_):
        """Apply every highway layer in order and return the final tensor."""
        out = input_

        for transform, gate in zip(self.linears, self.gates):
            candidate = fluid.layers.relu(transform(out))
            transform_gate = fluid.layers.sigmoid(gate(out))
            carry_gate = 1 - transform_gate
            out = candidate * transform_gate + out * carry_gate

        return out
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,206 @@
|
|||
from parakeet.models.transformerTTS.module import *
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
from parakeet.modules.layers import Conv1D
|
||||
from parakeet.modules.utils import *
|
||||
from parakeet.modules.multihead_attention import MultiheadAttention
|
||||
from parakeet.modules.feed_forward import PositionwiseFeedForward
|
||||
from parakeet.modules.prenet import PreNet
|
||||
from parakeet.modules.post_convnet import PostConvNet
|
||||
|
||||
|
||||
class Encoder(dg.Layer):
    """TransformerTTS text encoder.

    Pre-net output plus a scaled, frozen sinusoidal position embedding,
    followed by three (self-attention, position-wise feed-forward) stages.
    """

    def __init__(self, embedding_size, num_hidden, config):
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden

        # Learnable scale for the positional embedding, initialised to 1.
        alpha_attr = fluid.ParamAttr(
            name='alpha',
            initializer=fluid.initializer.Constant(value=1.0))
        self.alpha = self.create_parameter(shape=(1, ), attr=alpha_attr, dtype='float32')

        # Fixed sinusoid table with 1024 rows; row 0 is the padding row.
        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
        frozen_table = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.pos_emb = dg.Embedding(size=[1024, num_hidden],
                                    padding_idx=0,
                                    param_attr=frozen_table)

        self.encoder_prenet = EncoderPrenet(embedding_size=embedding_size,
                                            num_hidden=num_hidden,
                                            use_cudnn=config.use_gpu)

        self.layers = []
        for _ in range(3):
            self.layers.append(
                MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4))
        for idx, attn in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(idx), attn)

        self.ffns = []
        for _ in range(3):
            self.ffns.append(
                PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1,
                                        use_cudnn=config.use_gpu))
        for idx, ffn in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(idx), ffn)

    def forward(self, x, positional):
        """Encode the character sequence ``x``.

        Args:
            x (Variable): input characters, fed to the encoder pre-net.
            positional (Variable): position indices looked up in the
                positional embedding; also used to build the masks.

        Returns:
            tuple: ``(encoded, query_mask, attentions)`` where ``attentions``
            holds the attention output of each self-attention layer and
            ``query_mask`` is ``None`` outside training mode.
        """
        # Masks are only built while the dygraph tracer is in training mode.
        training = fluid.framework._dygraph_tracer()._train_mode
        if training:
            query_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask(positional, x)
        else:
            query_mask = None
            mask = None

        # Encoder pre_network
        x = self.encoder_prenet(x)  # (N,T,C)

        # Add the scaled positional encoding
        pos_encoding = self.pos_emb(positional)
        x = pos_encoding * self.alpha + x  # (N, T, C)

        # Positional dropout
        x = layers.dropout(x, 0.1)

        # Self attention encoder
        attentions = []
        for self_attn, ffn in zip(self.layers, self.ffns):
            x, attention = self_attn(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
|
||||
|
||||
class Decoder(dg.Layer):
    """Mel decoder of TransformerTTS.

    Applies a prenet and a linear projection to the mel input, adds a
    positional embedding scaled by the learnable scalar ``alpha``, runs three
    (self-attention, encoder-decoder attention, feed-forward) groups, and
    finally projects to mel frames, a post-net residual and stop tokens.
    """

    def __init__(self, num_hidden, config):
        """Build the decoder sub-layers.

        Args:
            num_hidden: hidden width used by the attention stack.
            config: configuration namespace; reads ``audio.num_mels``,
                ``audio.outputs_per_step``, ``hidden_size`` and ``use_gpu``.
        """
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden

        # Learnable scale for the positional encoding, initialised to 1.0.
        alpha_attr = fluid.ParamAttr(name='alpha')
        self.alpha = self.create_parameter(
            shape=(1, ),
            attr=alpha_attr,
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))

        # Non-trainable embedding initialised from the sinusoid table.
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        pos_emb_attr = fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
            trainable=False)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=pos_emb_attr)

        self.decoder_prenet = PreNet(
            input_size=config.audio.num_mels,
            hidden_size=num_hidden * 2,
            output_size=num_hidden,
            dropout_rate=0.2)
        self.linear = dg.Linear(num_hidden, num_hidden)

        # Layers kept in plain Python lists must be registered explicitly.
        self.selfattn_layers = []
        for _ in range(3):
            self.selfattn_layers.append(
                MultiheadAttention(num_hidden, num_hidden // 4,
                                   num_hidden // 4))
        for idx, sublayer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(idx), sublayer)

        self.attn_layers = []
        for _ in range(3):
            self.attn_layers.append(
                MultiheadAttention(num_hidden, num_hidden // 4,
                                   num_hidden // 4))
        for idx, sublayer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(idx), sublayer)

        self.ffns = []
        for _ in range(3):
            self.ffns.append(
                PositionwiseFeedForward(num_hidden, num_hidden * 4,
                                        filter_size=1))
        for idx, sublayer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(idx), sublayer)

        self.mel_linear = dg.Linear(
            num_hidden,
            config.audio.num_mels * config.audio.outputs_per_step)
        self.stop_linear = dg.Linear(num_hidden, 1)

        self.postconvnet = PostConvNet(
            config.audio.num_mels,
            config.hidden_size,
            filter_size=5,
            padding=4,
            num_conv=5,
            outputs_per_step=config.audio.outputs_per_step,
            use_cudnn=config.use_gpu)

    def forward(self, key, value, query, c_mask, positional):
        """Decode mel frames.

        Args:
            key: keys for the encoder-decoder attention.
            value: values for the encoder-decoder attention.
            query: decoder (mel) input.
            c_mask: mask returned by the encoder; only used in training mode.
            positional: decoder position indices for ``pos_emb``.

        Returns:
            Tuple ``(mel_out, out, attn_list, stop_tokens, selfattn_list)``:
            the linear mel projection, the same plus the post-net residual,
            the encoder-decoder attention outputs, the sigmoid stop
            predictions and the decoder self-attention outputs.
        """
        # Build the decoder mask from the triangular tensor; in training mode
        # it is combined with the padding mask derived from `positional`.
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = get_non_pad_mask(positional)
            pad_mask = get_attn_key_pad_mask(
                (positional == 0).astype(np.float32), query)
            triu = dg.to_variable(
                get_triu_tensor(query.numpy(),
                                query.numpy())).astype(np.float32)
            mask = fluid.layers.cast((pad_mask + triu) == 0, np.float32)

            # (batch_size, decoder_len, encoder_len)
            zero_mask = get_attn_key_pad_mask(
                layers.squeeze(c_mask, [-1]), query)
        else:
            triu = get_triu_tensor(query.numpy(),
                                   query.numpy()).astype(np.float32)
            mask = fluid.layers.cast(dg.to_variable(triu == 0), np.float32)
            m_mask = None
            zero_mask = None

        # Decoder pre-network
        query = self.decoder_prenet(query)

        # Centered position
        query = self.linear(query)

        # Scaled positional encoding
        pos_encoding = self.pos_emb(positional)
        query = pos_encoding * self.alpha + query

        # Positional dropout
        query = fluid.layers.dropout(query, 0.1)

        # Attention decoder-decoder, encoder-decoder
        selfattn_list = []
        attn_list = []
        stack = zip(self.selfattn_layers, self.attn_layers, self.ffns)
        for self_attention, cross_attention, feed_forward in stack:
            query, attn_dec = self_attention(
                query, query, query, mask=mask, query_mask=m_mask)
            query, attn_dot = cross_attention(
                key, value, query, mask=zero_mask, query_mask=m_mask)
            query = feed_forward(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)

        # Mel linear projection
        mel_out = self.mel_linear(query)

        # Post mel network, applied as a residual on top of mel_out
        out = mel_out + self.postconvnet(mel_out)

        # Stop tokens
        stop_tokens = self.stop_linear(query)
        stop_tokens = layers.squeeze(stop_tokens, [-1])
        stop_tokens = layers.sigmoid(stop_tokens)

        return mel_out, out, attn_list, stop_tokens, selfattn_list
|
||||
|
||||
class TransformerTTS(dg.Layer):
    """Encoder + decoder wrapper producing mel predictions from text."""

    def __init__(self, config):
        """Create the encoder and decoder from ``config``.

        ``config.embedding_size`` and ``config.hidden_size`` size the
        encoder; ``config.hidden_size`` sizes the decoder.
        """
        super(TransformerTTS, self).__init__()
        self.encoder = Encoder(config.embedding_size, config.hidden_size,
                               config)
        self.decoder = Decoder(config.hidden_size, config)
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """Run the encoder on the text, then the decoder on the mel input.

        Returns:
            ``(mel_output, postnet_output, attn_probs, stop_preds,
            attns_enc, attns_dec)``.
        """
        # key (batch_size, seq_len, channel)
        # c_mask (batch_size, seq_len)
        # attns_enc (channel / 2, seq_len, seq_len)
        encoded = self.encoder(characters, pos_text)
        key, c_mask, attns_enc = encoded

        # mel_output/postnet_output (batch_size, mel_len, n_mel)
        # attn_probs (128, mel_len, seq_len)
        # stop_preds (batch_size, mel_len, 1)
        # attns_dec (128, mel_len, mel_len)
        # The encoder output serves as both key and value.
        decoded = self.decoder(key, key, mel_input, c_mask, pos_mel)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = decoded

        return (mel_output, postnet_output, attn_probs, stop_preds,
                attns_enc, attns_dec)
|
||||
|
||||
class ModelPostNet(dg.Layer):
    """
    CBHG Network (mel -> linear)
    """

    def __init__(self, config):
        """Build the 1x1 input projection, the CBHG and the output projection.

        Reads ``audio.num_mels``, ``audio.n_fft``, ``hidden_size`` and
        ``batch_size`` from ``config``.
        """
        super(ModelPostNet, self).__init__()
        hidden_size = config.hidden_size
        self.pre_proj = Conv1D(
            in_channels=config.audio.num_mels,
            out_channels=hidden_size,
            filter_size=1,
            data_format="NCT")
        self.cbhg = CBHG(hidden_size, config.batch_size)
        # The output projection has n_fft // 2 + 1 channels.
        self.post_proj = Conv1D(
            in_channels=hidden_size,
            out_channels=(config.audio.n_fft // 2) + 1,
            filter_size=1,
            data_format="NCT")

    def forward(self, mel):
        """Map ``mel`` to a magnitude prediction.

        The last two axes are swapped before the convolutions (which are
        built with ``data_format="NCT"``) and swapped back afterwards.
        """
        channels_first = layers.transpose(mel, [0, 2, 1])
        hidden = self.cbhg(self.pre_proj(channels_first))
        mag_pred = self.post_proj(hidden)
        return layers.transpose(mag_pred, [0, 2, 1])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
import jsonargparse
|
||||
|
||||
def add_config_options_to_parser(parser):
    """Register every TransformerTTS option on ``parser``.

    Options are declared in a table of ``(name, type, default, help)`` rows
    and added in that order, followed by ``-c/--config`` which uses
    ``jsonargparse.ActionConfigFile``.
    """
    options = (
        # audio
        ('audio.num_mels', int, 80,
         "the number of mel bands when calculating mel spectrograms."),
        ('audio.n_fft', int, 2048, "the number of fft components."),
        ('audio.sr', int, 22050, "the sampling rate of audio data file."),
        ('audio.preemphasis', float, 0.97, "the preemphasis coefficient."),
        ('audio.hop_length', int, 128,
         "the number of samples to advance between frames."),
        ('audio.win_length', int, 1024,
         "the length (width) of the window function."),
        ('audio.power', float, 1.4, "the power to raise before griffin-lim."),
        ('audio.min_level_db', int, -100, "the minimum level db."),
        ('audio.ref_level_db', int, 20, "the reference level db."),
        ('audio.outputs_per_step', int, 1, "the outputs per step."),
        # network
        ('hidden_size', int, 256, "the hidden size in network."),
        ('embedding_size', int, 512, "the embedding vector size."),
        # training / synthesis
        ('warm_up_step', int, 4000, "the warm up step of learning rate."),
        ('grad_clip_thresh', float, 1.0, "the threshold of grad clip."),
        ('batch_size', int, 32, "batch size for training."),
        ('epochs', int, 10000, "the number of epoch for training."),
        ('lr', float, 0.001, "the learning rate for training."),
        ('save_step', int, 500, "checkpointing interval during training."),
        ('image_step', int, 2000,
         "attention image interval during training."),
        ('max_len', int, 400, "The max length of audio when synthsis."),
        ('transformer_step', int, 160000,
         "Global step to restore checkpoint of transformer in synthesis."),
        ('postnet_step', int, 100000,
         "Global step to restore checkpoint of postnet in synthesis."),
        ('use_gpu', bool, True, "use gpu or not during training."),
        ('use_data_parallel', bool, False,
         "use data parallel or not during training."),
        # paths
        ('data_path', str, './dataset/LJSpeech-1.1', "the path of dataset."),
        ('checkpoint_path', str, None,
         "the path to load checkpoint or pretrain model."),
        ('save_path', str, './checkpoint', "the path to save checkpoint."),
        ('log_dir', str, './log', "the directory to save tensorboard log."),
        ('sample_path', str, './log',
         "the directory to save audio sample in synthesis."),
    )
    for name, value_type, default, help_text in options:
        parser.add_argument('--' + name, type=value_type, default=default,
                            help=help_text)

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
|
|
@ -0,0 +1,123 @@
|
|||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import librosa
|
||||
|
||||
from parakeet import g2p
|
||||
from parakeet import audio
|
||||
|
||||
from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler
|
||||
from parakeet.data.dataset import Dataset
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.batch import TextIDBatcher, SpecBatcher
|
||||
|
||||
# Module-level audio processor shared by the dataset and the synthesis script.
_ljspeech_processor = audio.AudioProcessor(
    # signal / STFT settings
    sample_rate=22050,
    n_fft=2048,
    win_length=int(22050 * 0.05),
    hop_length=int(22050 * 0.0125),
    preemphasis=0.97,
    power=1.2,
    griffin_lim_iters=60,
    # mel settings
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
    # level / normalisation settings
    min_level_db=-100,
    ref_level_db=20,
    signal_norm=True,
    symmetric_norm=False,
    max_norm=1.,
    clip_norm=True,
    # waveform pre-processing
    do_trim_silence=False,
    sound_norm=False)
|
||||
|
||||
class LJSpeech(Dataset):
    """LJSpeech dataset yielding ``(mag, mel, phonemes)`` examples."""

    def __init__(self, root):
        """Load ``metadata.csv`` found under ``root`` (a ``str`` or ``Path``)."""
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if isinstance(root, Path):
            self.root = root
        else:
            self.root = Path(root)
        self.metadata = self._prepare_metadata()

    def _prepare_metadata(self):
        """Read ``metadata.csv`` into a DataFrame with three named columns."""
        # quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
        return pd.read_csv(
            self.root.joinpath("metadata.csv"),
            sep="|",
            header=None,
            quoting=3,
            names=["fname", "raw_text", "normalized_text"])

    def _get_example(self, metadatum):
        """Build one example from a metadata row.

        Override this method for a different preprocessing pipeline. If the
        pipeline needs several heavily-configured processors, consider
        passing a composed transform to ``__init__`` instead.

        Returns:
            ``(mag, mel, phonemes)``: float32 linear and mel spectrograms of
            the wav file and the int64 id sequence of the normalized text.
        """
        fname, raw_text, normalized_text = metadatum
        wav_path = self.root.joinpath("wavs", fname + ".wav")

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        wav = _ljspeech_processor.load_wav(str(wav_path))
        mag = _ljspeech_processor.spectrogram(wav).astype(np.float32)
        mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32)
        phonemes = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # maybe we need to implement it as a map in the future
        return (mag, mel, phonemes)

    def __getitem__(self, index):
        """Return the example for row ``index`` of the metadata."""
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        """Yield every example in metadata order."""
        for position in range(len(self)):
            yield self[position]

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)
|
||||
|
||||
|
||||
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples into padded training arrays.

    Examples are ordered by text length, longest first (ties keep their
    original order). For each example the decoder input is the mel
    spectrogram shifted right by one frame with a zero frame in front, and
    positions count from 1 so that 0 can be used as the padding id.

    Returns:
        ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``; the
        two mel arrays are transposed with ``axes=(0, 2, 1)`` after padding.
    """
    examples = list(batch)
    lengths = [len(text) for _, _, text in examples]
    # One stable descending order, shared by every field.
    order = sorted(
        range(len(examples)), key=lambda idx: lengths[idx], reverse=True)

    texts, mels, mel_inputs = [], [], []
    pos_texts, pos_mels, text_lens = [], [], []
    for idx in order:
        _, mel, text = examples[idx]
        go_frame = np.zeros([mel.shape[0], 1], np.float32)
        texts.append(text)
        mels.append(mel)
        mel_inputs.append(np.concatenate([go_frame, mel[:, :-1]], axis=-1))
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        text_lens.append(lengths[idx])

    # Pad sequence with largest len of the batch
    texts = TextIDBatcher(pad_id=0)(texts)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
|
||||
|
||||
def batch_examples_vocoder(batch):
    """Collate ``(mag, mel, text)`` examples into padded ``(mels, mags)``.

    Both arrays are padded with 0 by ``SpecBatcher`` and then transposed
    with ``axes=(0, 2, 1)``; the text field is ignored.
    """
    examples = list(batch)
    mel_list = [mel for _, mel, _ in examples]
    mag_list = [mag for mag, _, _ in examples]

    mels = np.transpose(SpecBatcher(pad_value=0.)(mel_list), axes=(0, 2, 1))
    mags = np.transpose(SpecBatcher(pad_value=0.)(mag_list), axes=(0, 2, 1))

    return (mels, mags)
|
||||
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
import os
|
||||
from scipy.io.wavfile import write
|
||||
from parakeet.g2p.en import text_to_sequence
|
||||
import numpy as np
|
||||
from network import Model, ModelPostNet
|
||||
from tqdm import tqdm
|
||||
from tensorboardX import SummaryWriter
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
from preprocess import _ljspeech_processor
|
||||
from pathlib import Path
|
||||
import jsonargparse
|
||||
from parse import add_config_options_to_parser
|
||||
from pprint import pprint
|
||||
|
||||
def load_checkpoint(step, model_path):
    """Load the dygraph checkpoint ``<model_path>/<step>``.

    Returns only the model state dict; the optimizer state that
    ``load_dygraph`` also returns is discarded.
    """
    checkpoint_prefix = os.path.join(model_path, step)
    params, _optimizer_state = fluid.dygraph.load_dygraph(checkpoint_prefix)
    return params
|
||||
|
||||
def synthesis(text_input, cfg):
    """Synthesize ``text_input`` and write the result as audio.

    Loads the transformer and post-net checkpoints selected by
    ``cfg.transformer_step`` / ``cfg.postnet_step`` from
    ``cfg.checkpoint_path``, decodes ``cfg.max_len`` frames
    autoregressively, converts the predicted magnitude spectrogram to a
    waveform, logs it to tensorboard under ``<cfg.log_dir>/synthesis`` and
    writes ``<cfg.sample_path>/test.wav``.

    Args:
        text_input: text to synthesize.
        cfg: parsed configuration namespace.
    """
    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())

    # tensorboard
    # makedirs handles nested paths and an already-existing directory.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'synthesis')

    writer = SummaryWriter(path)

    with dg.guard(place):
        model = Model(cfg)
        model_postnet = ModelPostNet(cfg)

        model.set_dict(
            load_checkpoint(
                str(cfg.transformer_step),
                os.path.join(cfg.checkpoint_path, "transformer")))
        model_postnet.set_dict(
            load_checkpoint(
                str(cfg.postnet_step),
                os.path.join(cfg.checkpoint_path, "postnet")))

        # init input
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        # Start from a single all-zero frame whose width follows the
        # configured number of mel bands (default 80).
        mel_input = dg.to_variable(
            np.zeros([1, 1, cfg.audio.num_mels])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        model.eval()
        model_postnet.eval()

        pbar = tqdm(range(cfg.max_len))

        for _ in pbar:
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            # Feed the newest predicted frame back as the next decoder input.
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)
        mag_pred = model_postnet(postnet_pred)

        wav = _ljspeech_processor.inv_spectrogram(
            fluid.layers.transpose(
                fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
        os.makedirs(cfg.sample_path, exist_ok=True)
        write(os.path.join(cfg.sample_path, 'test.wav'), cfg.audio.sr, wav)

    # Close the writer so the logged audio is written out.
    writer.close()
|
||||
|
||||
if __name__ == '__main__':
    # Options come from the bundled synthesis config file.
    parser = jsonargparse.ArgumentParser(
        description="Synthesis model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cli_args = ['-c', './config/synthesis.yaml']
    cfg = parser.parse_args(cli_args)
    synthesis("Transformer model is so fast!", cfg)
|
|
@ -0,0 +1,111 @@
|
|||
from network import *
|
||||
from tensorboardX import SummaryWriter
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
from pathlib import Path
|
||||
import jsonargparse
|
||||
from parse import add_config_options_to_parser
|
||||
from pprint import pprint
|
||||
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
|
||||
|
||||
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that are not found on the wrapper itself are forwarded
    to the wrapped model, which is stored in ``_sub_layers["_layers"]``.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the wrapper first, then on the wrapped model."""
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Compare by value: `is` on a string literal tests object identity
        # and only works when the two strings happen to be interned.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)
|
||||
|
||||
|
||||
def main(cfg):
    """Train the post-net (mel -> magnitude) model.

    Builds ``ModelPostNet`` and an Adam optimizer with Noam decay, optionally
    restores ``cfg.checkpoint_path``, and minimizes the mean absolute error
    between the predicted and the target magnitude spectrograms. Rank 0 logs
    the loss to tensorboard under ``<cfg.log_dir>/postnet`` and saves a
    checkpoint every ``cfg.save_step`` steps under ``<cfg.save_path>/postnet``.

    Args:
        cfg: parsed configuration namespace.
    """
    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids a crash when several ranks create the directory at
    # the same time; makedirs also handles nested paths.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'postnet')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = ModelPostNet(cfg)

        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step),
            parameter_list=model.parameters())

        if cfg.checkpoint_path is not None:
            model_dict, opti_dict = fluid.dygraph.load_dygraph(
                cfg.checkpoint_path)
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            print("load checkpoint!!!")

        if cfg.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = MyDataParallel(model, strategy)

        reader = LJSpeechLoader(
            cfg, nranks, local_rank, is_vocoder=True).reader()

        for epoch in range(cfg.epochs):
            pbar = tqdm(reader)
            for data in pbar:
                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1

                mag_pred = model(mel)
                loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mag_pred, mag)))

                if cfg.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg.grad_clip_thresh))
                model.clear_gradients()

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    # Only rank 0 writes checkpoints so that parallel
                    # workers do not write the same files concurrently.
                    if global_step % cfg.save_step == 0:
                        os.makedirs(cfg.save_path, exist_ok=True)
                        save_path = os.path.join(
                            cfg.save_path, 'postnet/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
|
||||
|
||||
if __name__ == '__main__':
    # Options come from the bundled post-net training config file.
    parser = jsonargparse.ArgumentParser(
        description="Train postnet model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cli_args = ['-c', './config/train_postnet.yaml']
    cfg = parser.parse_args(cli_args)
    main(cfg)
|
|
@ -0,0 +1,150 @@
|
|||
import os
|
||||
from tqdm import tqdm
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
from network import *
|
||||
from tensorboardX import SummaryWriter
|
||||
from pathlib import Path
|
||||
import jsonargparse
|
||||
from parse import add_config_options_to_parser
|
||||
from pprint import pprint
|
||||
from matplotlib import cm
|
||||
from parakeet.modules.utils import cross_entropy
|
||||
from parakeet.models.dataloader.jlspeech import LJSpeechLoader
|
||||
|
||||
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that are not found on the wrapper itself are forwarded
    to the wrapped model, which is stored in ``_sub_layers["_layers"]``.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the wrapper first, then on the wrapped model."""
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        # Compare by value: `is` on a string literal tests object identity
        # and only works when the two strings happen to be interned.
        elif key == "_layers":
            return object.__getattribute__(self, "_sub_layers")["_layers"]
        else:
            return getattr(
                object.__getattribute__(self, "_sub_layers")["_layers"], key)
|
||||
|
||||
|
||||
def main(cfg):
    """Train the TransformerTTS (text -> mel) model.

    Builds ``TransformerTTS`` and an Adam optimizer with Noam decay,
    optionally restores ``cfg.checkpoint_path``, and minimizes the sum of
    the mel L1 loss, the post-net mel L1 loss and the stop-token loss.
    Rank 0 logs scalars and attention images to tensorboard under
    ``<cfg.log_dir>/transformer`` and saves a checkpoint every
    ``cfg.save_step`` steps under ``<cfg.save_path>/transformer``.

    Args:
        cfg: parsed configuration namespace.
    """
    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1

    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids a crash when several ranks create the directory at
    # the same time; makedirs also handles nested paths.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'transformer')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = TransformerTTS(cfg)

        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step),
            parameter_list=model.parameters())

        reader = LJSpeechLoader(cfg, nranks, local_rank).reader()

        if cfg.checkpoint_path is not None:
            model_dict, opti_dict = fluid.dygraph.load_dygraph(
                cfg.checkpoint_path)
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            print("load checkpoint!!!")

        if cfg.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = MyDataParallel(model, strategy)

        for epoch in range(cfg.epochs):
            pbar = tqdm(reader)

            for data in pbar:
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length = data

                global_step += 1

                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    character, mel_input, pos_text, pos_mel)

                # Stop-token target: a single 1 per row at index
                # text_length - 1, zeros elsewhere.
                label = np.zeros(stop_preds.shape).astype(np.float32)
                text_length = text_length.numpy()
                for row in range(label.shape[0]):
                    label[row][text_length[row] - 1] = 1

                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
                loss = mel_loss + post_mel_loss + stop_loss

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy(),
                        'stop_loss': stop_loss.numpy()
                    }, global_step)

                    writer.add_scalars('alphas', {
                        'encoder_alpha': model.encoder.alpha.numpy(),
                        'decoder_alpha': model.decoder.alpha.numpy(),
                    }, global_step)

                    writer.add_scalar(
                        'learning_rate',
                        optimizer._learning_rate.step().numpy(), global_step)

                    if global_step % cfg.image_step == 1:
                        # For every attention layer, log four maps taken
                        # at indices 0, 16, 32 and 48 of its first axis.
                        for layer_idx, prob in enumerate(attn_probs):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_%d_0' % global_step, x,
                                    layer_idx * 4 + j, dataformats="HWC")

                        for layer_idx, prob in enumerate(attn_enc):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_enc_%d_0' % global_step, x,
                                    layer_idx * 4 + j, dataformats="HWC")

                        for layer_idx, prob in enumerate(attn_dec):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_dec_%d_0' % global_step, x,
                                    layer_idx * 4 + j, dataformats="HWC")

                if cfg.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg.grad_clip_thresh))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % cfg.save_step == 0:
                    os.makedirs(cfg.save_path, exist_ok=True)
                    save_path = os.path.join(
                        cfg.save_path, 'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Options come from the bundled transformer training config file.
    parser = jsonargparse.ArgumentParser(
        description="Train TransformerTTS model",
        formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cli_args = ['-c', './config/train_transformer.yaml']
    cfg = parser.parse_args(cli_args)
    main(cfg)
|
|
@ -0,0 +1,111 @@
|
|||
# WaveFlow with Paddle Fluid
|
||||
|
||||
Paddle fluid implementation of [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219).
|
||||
|
||||
## Project Structure
|
||||
```text
|
||||
├── configs # yaml configuration files of preset model hyperparameters
|
||||
├── benchmark.py # benchmark code to test the speed of batched speech synthesis
|
||||
├── data.py # dataset and dataloader settings for LJSpeech
|
||||
├── synthesis.py # script for speech synthesis
|
||||
├── train.py # script for model training
|
||||
├── utils.py # helper functions for e.g., model checkpointing
|
||||
├── waveflow.py # WaveFlow model high level APIs
|
||||
└── waveflow_modules.py # WaveFlow model implementation
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
There are many hyperparameters to be tuned depending on the specification of model and dataset you are working on.
|
||||
We provide `waveflow_ljspeech.yaml` as a hyperparameter set that works well on the LJSpeech dataset.
|
||||
|
||||
Note that `train.py`, `synthesis.py`, and `benchmark.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for training, synthesizing and benchmarking. You can also overwrite these preset hyperparameters from the command line by passing the parameters to change after `--config`.
|
||||
For example `--config=${yaml} --batch_size=8` can overwrite the corresponding hyperparameters in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`.
|
||||
|
||||
Note that you also need to specify some additional parameters for `train.py`, `synthesis.py`, and `benchmark.py`, and the details can be found in `train.add_options_to_parser`, `synthesis.add_options_to_parser`, and `benchmark.add_options_to_parser`, respectively.
|
||||
|
||||
### Dataset
|
||||
|
||||
Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
||||
|
||||
```bash
|
||||
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
||||
tar xjvf LJSpeech-1.1.tar.bz2
|
||||
```
|
||||
|
||||
In this example, assume that the path of unzipped LJSpeech dataset is `./data/LJSpeech-1.1`.
|
||||
|
||||
### Train on single GPU
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python -u train.py \
|
||||
--config=./configs/waveflow_ljspeech.yaml \
|
||||
--root=./data/LJSpeech-1.1 \
|
||||
--name=${ModelName} --batch_size=4 \
|
||||
--parallel=false --use_gpu=true
|
||||
```
|
||||
|
||||
#### Save and Load checkpoints
|
||||
|
||||
Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default.
|
||||
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
|
||||
|
||||
There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint):
|
||||
1. Use `--checkpoint=./runs/waveflow/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed.
|
||||
2. Use `--iteration=500000`.
|
||||
3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/waveflow/${ModelName}/checkpoint`.
|
||||
|
||||
### Train on multiple GPUs
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
||||
python -u -m paddle.distributed.launch train.py \
|
||||
--config=./configs/waveflow_ljspeech.yaml \
|
||||
--root=./data/LJSpeech-1.1 \
|
||||
--name=${ModelName} --parallel=true --use_gpu=true
|
||||
```
|
||||
|
||||
Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to set the GPUs that you want to use to be visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode.
|
||||
|
||||
### Monitor with Tensorboard
|
||||
|
||||
By default, the logs are saved in `./runs/waveflow/${ModelName}/logs/`. You can monitor the logs with TensorBoard.
|
||||
|
||||
```bash
|
||||
tensorboard --logdir=${log_dir} --port=8888
|
||||
```
|
||||
|
||||
### Synthesize from a checkpoint
|
||||
|
||||
Check the [Save and load checkpoint](#save-and-load-checkpoints) section on how to load a specific checkpoint.
|
||||
The following example will automatically load the latest checkpoint:
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python -u synthesis.py \
|
||||
--config=./configs/waveflow_ljspeech.yaml \
|
||||
--root=./data/LJSpeech-1.1 \
|
||||
--name=${ModelName} --use_gpu=true \
|
||||
--output=./syn_audios \
|
||||
--sample=${SAMPLE} \
|
||||
--sigma=1.0
|
||||
```
|
||||
|
||||
In this example, `--output` specifies where to save the synthesized audio files, and `--sample` specifies which sample in the valid dataset (a split of the whole LJSpeech dataset that by default contains the first 16 audio samples) to synthesize. The audio is synthesized from the mel-spectrogram computed from the ground-truth audio of that sample; e.g., `--sample=0` synthesizes the first audio in the valid dataset.
|
||||
|
||||
### Benchmarking
|
||||
|
||||
Use the following example to benchmark the speed of batched speech synthesis; it reports how many times faster than real time the synthesis runs:
|
||||
|
||||
```bash
|
||||
export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.."
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python -u benchmark.py \
|
||||
--config=./configs/waveflow_ljspeech.yaml \
|
||||
--root=./data/LJSpeech-1.1 \
|
||||
--name=${ModelName} --use_gpu=true
|
||||
```
|
|
@ -0,0 +1,71 @@
|
|||
import os
|
||||
import random
|
||||
from pprint import pprint
|
||||
|
||||
import jsonargparse
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
|
||||
import utils
|
||||
from waveflow import WaveFlow
|
||||
|
||||
|
||||
def add_options_to_parser(parser):
    """Register the benchmark script's command line options on ``parser``.

    Args:
        parser: an argument parser exposing ``add_argument``.
    """
    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = [
        ('--model', dict(type=str, default='waveflow',
                         help="general name of the model")),
        ('--name', dict(type=str,
                        help="specific name of the training model")),
        ('--root', dict(type=str,
                        help="root path of the LJSpeech dataset")),
        ('--use_gpu', dict(type=bool, default=True,
                           help="option to use gpu training")),
        ('--iteration', dict(type=int, default=None,
                             help=("which iteration of checkpoint to load, "
                                   "default to load the latest checkpoint"))),
        ('--checkpoint', dict(type=str, default=None,
                              help="path of the checkpoint to load")),
    ]
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
|
||||
|
||||
|
||||
def benchmark(config):
    """Build a WaveFlow model in inference mode and run its benchmark.

    Args:
        config: parsed configuration namespace. This function reads
            ``model``, ``name``, ``use_gpu`` and ``seed`` from it and passes
            the whole namespace on to ``WaveFlow``.
    """
    # Show the full configuration for reproducibility.
    pprint(jsonargparse.namespace_to_dict(config))

    # Checkpoints are looked up under runs/<model>/<name>/checkpoint.
    checkpoint_dir = os.path.join(
        "runs", config.model, config.name, "checkpoint")

    # Select the device: first GPU if requested, CPU otherwise.
    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with dg.guard(place):
        # Seed every random number source used here with the same value.
        seed = config.seed
        random.seed(seed)
        np.random.seed(seed)
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        print("Random Seed: ", seed)

        # Construct the model without training-only components.
        model = WaveFlow(config, checkpoint_dir)
        model.build(training=False)

        # Time the batched synthesis.
        model.benchmark()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Create the parser. The description previously referred to synthesizing
    # with a "WaveNet" model, which was wrong for this benchmark script.
    parser = jsonargparse.ArgumentParser(
        description="Benchmark synthesis speed of WaveFlow model",
        formatter_class='default_argparse')
    add_options_to_parser(parser)
    utils.add_config_options_to_parser(parser)

    # Parse arguments from both the command line and the yaml config file.
    # For conflicting updates to the same field,
    # the preceding update will be overwritten by the following one.
    config = parser.parse_args()
    benchmark(config)
|
|
@ -0,0 +1,24 @@
|
|||
valid_size: 16
|
||||
segment_length: 16000
|
||||
sample_rate: 22050
|
||||
fft_window_shift: 256
|
||||
fft_window_size: 1024
|
||||
fft_size: 1024
|
||||
mel_bands: 80
|
||||
mel_fmin: 0.0
|
||||
mel_fmax: 8000.0
|
||||
|
||||
seed: 1234
|
||||
learning_rate: 0.0002
|
||||
batch_size: 8
|
||||
test_every: 2000
|
||||
save_every: 10000
|
||||
max_iterations: 3000000
|
||||
|
||||
sigma: 1.0
|
||||
n_flows: 8
|
||||
n_group: 16
|
||||
n_layers: 8
|
||||
n_channels: 64
|
||||
kernel_h: 3
|
||||
kernel_w: 3
|
|
@ -0,0 +1,131 @@
|
|||
import random
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
from paddle import fluid
|
||||
|
||||
from parakeet.datasets import ljspeech
|
||||
from parakeet.data import dataset
|
||||
from parakeet.data.batch import SpecBatcher, WavBatcher
|
||||
from parakeet.data.datacargo import DataCargo
|
||||
from parakeet.data.sampler import DistributedSampler, BatchSampler
|
||||
from scipy.io.wavfile import read
|
||||
|
||||
|
||||
class Dataset(ljspeech.LJSpeech):
    """LJSpeech dataset variant whose examples are raw waveforms."""

    def __init__(self, config):
        """Initialise the parent dataset from ``config.root`` and keep ``config``."""
        super(Dataset, self).__init__(config.root)
        self.config = config

    def _get_example(self, metadatum):
        """Load the waveform of one metadata entry.

        Args:
            metadatum: a 3-element record whose first element is the file
                name (without extension) of the wav file under ``wavs/``.

        Returns:
            The audio array as returned by ``scipy.io.wavfile.read``.
        """
        # Only the file name is needed; the record must still have 3 fields.
        fname, _unused_a, _unused_b = metadatum
        wav_file = self.root.joinpath("wavs", fname + ".wav")

        sr_and_audio = read(wav_file)
        loaded_sr = sr_and_audio[0]
        audio = sr_and_audio[1]
        # The file must already be at the configured sampling rate.
        assert loaded_sr == self.config.sample_rate

        return audio
|
||||
|
||||
|
||||
class Subset(dataset.Dataset):
    """View of a subset of ``dataset`` yielding ``(audio, mel)`` pairs.

    Args:
        dataset: the underlying dataset; must expose ``config`` and be
            indexable, returning a 1-D audio array per index.
        indices: indices into ``dataset`` that make up this subset.
        valid: if true, the whole audio is returned; otherwise a segment of
            ``config.segment_length`` samples is cropped (or zero-padded).
    """

    def __init__(self, dataset, indices, valid):
        self.dataset = dataset
        self.indices = indices
        self.valid = valid
        self.config = dataset.config
        # Mel filter bank, built on first use and then reused. It depends
        # only on the configuration, so rebuilding it per sample is wasted
        # work.
        self._mel_filter_bank = None

    def _get_mel_filter_bank(self):
        """Return the cached mel filter bank, building it on first call."""
        # getattr keeps this working for instances created without __init__.
        if getattr(self, "_mel_filter_bank", None) is None:
            # mel_filter_bank shape: [n_mels, 1 + n_fft/2]
            self._mel_filter_bank = librosa.filters.mel(
                sr=self.config.sample_rate,
                n_fft=self.config.fft_size,
                n_mels=self.config.mel_bands,
                fmin=self.config.mel_fmin,
                fmax=self.config.mel_fmax)
        return self._mel_filter_bank

    def get_mel(self, audio):
        """Compute the log mel spectrogram of ``audio``.

        Args:
            audio: 1-D float waveform.

        Returns:
            Array of shape [n_mels, num_frames] holding the natural log of
            the mel magnitudes, clipped from below at 1e-5.
        """
        spectrogram = librosa.core.stft(
            audio, n_fft=self.config.fft_size,
            hop_length=self.config.fft_window_shift,
            win_length=self.config.fft_window_size)
        spectrogram_magnitude = np.abs(spectrogram)

        # mel shape: [n_mels, num_frames]
        mel = np.dot(self._get_mel_filter_bank(), spectrogram_magnitude)

        # Normalize mel: clip to avoid log(0), then take the log.
        clip_val = 1e-5
        ref_constant = 1
        mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant)

        return mel

    def __getitem__(self, idx):
        """Return ``(audio, mel)`` for the ``idx``-th element of the subset."""
        audio = self.dataset[self.indices[idx]]
        segment_length = self.config.segment_length

        if self.valid:
            # whole audio for valid set
            pass
        else:
            # audio shape: [len]
            if audio.shape[0] >= segment_length:
                # Random crop of exactly segment_length samples.
                max_audio_start = audio.shape[0] - segment_length
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:(audio_start + segment_length)]
            else:
                # Too short: right-pad with zeros up to segment_length.
                audio = np.pad(audio, (0, segment_length - audio.shape[0]),
                               mode='constant', constant_values=0)

        # Normalize audio to the [-1, 1] range.
        # NOTE(review): assumes 16-bit PCM input -- confirm against the data.
        audio = audio.astype(np.float32) / 32768.0
        mel = self.get_mel(audio)

        return audio, mel

    def _batch_examples(self, batch):
        """Collate a list of ``(audio, mel)`` samples into padded batches."""
        audios = [sample[0] for sample in batch]
        mels = [sample[1] for sample in batch]

        audios = WavBatcher(pad_value=0.0)(audios)
        mels = SpecBatcher(pad_value=0.0)(mels)

        return audios, mels

    def __len__(self):
        """Number of examples in the subset."""
        return len(self.indices)
|
||||
|
||||
|
||||
class LJSpeech:
    """Builds the train and valid data loaders for the LJSpeech dataset.

    After construction, ``trainloader`` is an endless iterator over training
    batches and ``validloader`` is a reader that is called to iterate the
    valid set once.
    """

    def __init__(self, config, nranks, rank):
        """Create the loaders.

        Args:
            config: configuration namespace; reads ``use_gpu``,
                ``valid_size`` and ``batch_size`` here.
            nranks: number of training processes.
            rank: index of the current process.
        """
        if config.use_gpu:
            place = fluid.CUDAPlace(rank)
        else:
            place = fluid.CPUPlace()

        # Whole LJSpeech dataset.
        ds = Dataset(config)

        # The first valid_size examples form the valid set, the rest is
        # used (in shuffled order) for training.
        n_valid = config.valid_size
        all_indices = list(range(len(ds)))
        train_indices = all_indices[n_valid:]
        valid_indices = all_indices[:n_valid]
        random.shuffle(train_indices)

        # Train dataset: the total batch is split evenly across ranks.
        trainset = Subset(ds, train_indices, valid=False)
        sampler = DistributedSampler(len(trainset), nranks, rank)
        total_bs = config.batch_size
        assert total_bs % nranks == 0
        per_rank_bs = total_bs // nranks
        train_sampler = BatchSampler(sampler, per_rank_bs, drop_last=True)
        trainloader = DataCargo(trainset, batch_sampler=train_sampler)

        trainreader = fluid.io.PyReader(capacity=50, return_list=True)
        trainreader.decorate_batch_generator(trainloader, place)

        def _endless_batches():
            # Restart the reader whenever it is exhausted.
            while True:
                for data in trainreader():
                    yield data

        self.trainloader = _endless_batches()

        # Valid dataset.
        validset = Subset(ds, valid_indices, valid=True)
        # Currently only support batch_size = 1 for valid loader.
        validloader = DataCargo(validset, batch_size=1, shuffle=False)

        validreader = fluid.io.PyReader(capacity=20, return_list=True)
        validreader.decorate_batch_generator(validloader, place)
        self.validloader = validreader
|
|
@ -0,0 +1,85 @@
|
|||
import os
|
||||
import random
|
||||
from pprint import pprint
|
||||
|
||||
import jsonargparse
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
|
||||
import utils
|
||||
from waveflow import WaveFlow
|
||||
|
||||
|
||||
def add_options_to_parser(parser):
    """Register the synthesis script's command line options on ``parser``.

    Args:
        parser: an argument parser exposing ``add_argument``.
    """
    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = [
        ('--model', dict(type=str, default='waveflow',
                         help="general name of the model")),
        ('--name', dict(type=str,
                        help="specific name of the training model")),
        ('--root', dict(type=str,
                        help="root path of the LJSpeech dataset")),
        ('--use_gpu', dict(type=bool, default=True,
                           help="option to use gpu training")),
        ('--iteration', dict(type=int, default=None,
                             help=("which iteration of checkpoint to load, "
                                   "default to load the latest checkpoint"))),
        ('--checkpoint', dict(type=str, default=None,
                              help="path of the checkpoint to load")),
        ('--output', dict(type=str, default="./syn_audios",
                          help="path to write synthesized audio files")),
        ('--sample', dict(type=int, default=None,
                          help="which of the valid samples to synthesize audio")),
    ]
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
|
||||
|
||||
|
||||
def synthesize(config):
    """Build a WaveFlow model in inference mode and synthesize audio.

    Args:
        config: parsed configuration namespace. This function reads
            ``model``, ``name``, ``use_gpu``, ``seed``, ``checkpoint`` and
            ``iteration`` from it and passes the whole namespace on to
            ``WaveFlow``.
    """
    # Show the full configuration for reproducibility.
    pprint(jsonargparse.namespace_to_dict(config))

    # Checkpoints are looked up under runs/<model>/<name>/checkpoint.
    checkpoint_dir = os.path.join(
        "runs", config.model, config.name, "checkpoint")

    # Select the device: first GPU if requested, CPU otherwise.
    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with dg.guard(place):
        # Seed every random number source used here with the same value.
        seed = config.seed
        random.seed(seed)
        np.random.seed(seed)
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        print("Random Seed: ", seed)

        # Construct the model without training-only components.
        model = WaveFlow(config, checkpoint_dir)
        model.build(training=False)

        # Work out which iteration the loaded checkpoint corresponds to.
        if config.checkpoint is not None:
            # An explicit path ends in "step-<iteration>".
            base_name = config.checkpoint.split('/')[-1]
            iteration = int(base_name.split('-')[-1])
        elif config.iteration is not None:
            iteration = config.iteration
        else:
            iteration = utils.load_latest_checkpoint(checkpoint_dir)

        # Run model inference.
        model.infer(iteration)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Create the parser. The description previously named "WaveNet",
    # but this script synthesizes with the WaveFlow model.
    parser = jsonargparse.ArgumentParser(
        description="Synthesize audio using WaveFlow model",
        formatter_class='default_argparse')
    add_options_to_parser(parser)
    utils.add_config_options_to_parser(parser)

    # Parse arguments from both the command line and the yaml config file.
    # For conflicting updates to the same field,
    # the preceding update will be overwritten by the following one.
    config = parser.parse_args()
    synthesize(config)
|
|
@ -0,0 +1,114 @@
|
|||
import os
|
||||
import random
|
||||
import subprocess
|
||||
import time
|
||||
from pprint import pprint
|
||||
|
||||
import jsonargparse
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
import slurm
|
||||
import utils
|
||||
from waveflow import WaveFlow
|
||||
|
||||
|
||||
def add_options_to_parser(parser):
    """Register the training script's command line options on ``parser``.

    Args:
        parser: an argument parser exposing ``add_argument``.
    """
    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = [
        ('--model', dict(type=str, default='waveflow',
                         help="general name of the model")),
        ('--name', dict(type=str,
                        help="specific name of the training model")),
        ('--root', dict(type=str,
                        help="root path of the LJSpeech dataset")),
        ('--parallel', dict(type=bool, default=True,
                            help="option to use data parallel training")),
        ('--use_gpu', dict(type=bool, default=True,
                           help="option to use gpu training")),
        ('--iteration', dict(type=int, default=None,
                             help=("which iteration of checkpoint to load, "
                                   "default to load the latest checkpoint"))),
        ('--checkpoint', dict(type=str, default=None,
                              help="path of the checkpoint to load")),
    ]
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
|
||||
|
||||
|
||||
def train(config):
    """Train a WaveFlow model until ``config.max_iterations`` is reached.

    Args:
        config: parsed configuration namespace. This function reads
            ``use_gpu``, ``parallel``, ``model``, ``name``, ``seed``,
            ``checkpoint``, ``iteration``, ``max_iterations``, ``test_every``
            and ``save_every`` from it and passes the whole namespace on to
            ``WaveFlow``.
    """
    use_gpu = config.use_gpu
    # Data parallel training is only considered when running on GPU.
    if use_gpu:
        parallel = config.parallel
    else:
        parallel = False

    # Rank of the current process and total number of processes.
    if parallel:
        env = dg.parallel.Env()
        rank = env.local_rank
        nranks = env.nranks
    else:
        rank = 0
        nranks = 1

    is_master = rank == 0
    if is_master:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(config))

    # Make checkpoint directory: runs/<model>/<name>/checkpoint.
    run_dir = os.path.join("runs", config.model, config.name)
    checkpoint_dir = os.path.join(run_dir, "checkpoint")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Only rank 0 writes tensorboard logs.
    if is_master:
        tb = SummaryWriter(os.path.join(run_dir, "logs"))
    else:
        tb = None

    # Each rank uses the GPU with its own index when running on GPU.
    if use_gpu:
        place = fluid.CUDAPlace(rank)
    else:
        place = fluid.CPUPlace()

    with dg.guard(place):
        # Seed every random number source used here with the same value.
        seed = config.seed
        random.seed(seed)
        np.random.seed(seed)
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        print("Random Seed: ", seed)

        # Build model.
        model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
        model.build()

        # Work out the iteration to resume from.
        if config.checkpoint is not None:
            # An explicit path ends in "step-<iteration>".
            base_name = config.checkpoint.split('/')[-1]
            iteration = int(base_name.split('-')[-1])
        elif config.iteration is not None:
            iteration = config.iteration
        else:
            iteration = utils.load_latest_checkpoint(checkpoint_dir, rank)

        while iteration < config.max_iterations:
            # Run one single training step.
            model.train_step(iteration)
            iteration += 1

            # Periodic validation on every rank.
            if iteration % config.test_every == 0:
                model.valid_step(iteration)

            # Periodic checkpointing on rank 0 only.
            if is_master and iteration % config.save_every == 0:
                model.save(iteration)

    # Close TensorBoard.
    if is_master:
        tb.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the command line parser: script options first, then the
    # options shared with the yaml config file.
    parser = jsonargparse.ArgumentParser(
        description="Train WaveFlow model",
        formatter_class='default_argparse')
    add_options_to_parser(parser)
    utils.add_config_options_to_parser(parser)

    # Arguments come from both the command line and the yaml config file.
    # When the same field is set more than once, the later update
    # overwrites the earlier one.
    config = parser.parse_args()
    train(config)
|
|
@ -0,0 +1,114 @@
|
|||
import itertools
|
||||
import os
|
||||
import time
|
||||
|
||||
import jsonargparse
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
|
||||
def add_config_options_to_parser(parser):
    """Register the options that may also be set in the yaml config file.

    Besides the typed options, a ``--config`` option handled by
    ``jsonargparse.ActionConfigFile`` is added last.

    Args:
        parser: an argument parser exposing ``add_argument``.
    """
    # (flag, type, help) triples in registration order; a help of None means
    # the option is registered without a help text.
    typed_options = (
        # Data related.
        ('--valid_size', int, "size of the valid dataset"),
        ('--segment_length', int, "the length of audio clip for training"),
        ('--sample_rate', int, "sampling rate of audio data file"),
        ('--fft_window_shift', int,
         "the shift of fft window for each frame"),
        ('--fft_window_size', int,
         "the size of fft window for each frame"),
        ('--fft_size', int, "the size of fft filter on each frame"),
        ('--mel_bands', int,
         "the number of mel bands when calculating mel spectrograms"),
        ('--mel_fmin', float,
         "lowest frequency in calculating mel spectrograms"),
        ('--mel_fmax', float,
         "highest frequency in calculating mel spectrograms"),
        # Training related.
        ('--seed', int, "seed of random initialization for the model"),
        ('--learning_rate', float, None),
        ('--batch_size', int, "batch size for training"),
        ('--test_every', int, "test interval during training"),
        ('--save_every', int, "checkpointing interval during training"),
        ('--max_iterations', int, "maximum training iterations"),
        # Model related.
        ('--sigma', float,
         "standard deviation of the latent Gaussian variable"),
        ('--n_flows', int, "number of flows"),
        ('--n_group', int,
         "number of adjacent audio samples to squeeze into one column"),
        ('--n_layers', int,
         "number of conv2d layer in one wavenet-like flow architecture"),
        ('--n_channels', int, "number of residual channels in flow"),
        ('--kernel_h', int, "height of the kernel in the conv2d layer"),
        ('--kernel_w', int, "width of the kernel in the conv2d layer"),
    )
    for flag, value_type, help_text in typed_options:
        if help_text is None:
            parser.add_argument(flag, type=value_type)
        else:
            parser.add_argument(flag, type=value_type, help=help_text)

    parser.add_argument('--config', action=jsonargparse.ActionConfigFile)
|
||||
|
||||
|
||||
def load_latest_checkpoint(checkpoint_dir, rank=0):
    """Return the iteration recorded in the checkpoint index file.

    The index file is ``<checkpoint_dir>/checkpoint`` and holds a single line
    of the form ``model_checkpoint_path: step-<iteration>``. If it does not
    exist, rank 0 creates it pointing at ``step-0``; every other rank waits
    until the file appears.

    Args:
        checkpoint_dir: directory holding the checkpoints and the index file.
        rank: rank of the calling process; only rank 0 creates the file.

    Returns:
        The iteration number of the latest checkpoint as an int.
    """
    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
    # Create checkpoint index file if not exist.
    if (not os.path.isfile(checkpoint_path)) and rank == 0:
        # Write to a temporary file and rename it into place, so that
        # waiting processes never observe an empty, half-written index
        # (which would make the parsing below fail).
        tmp_path = "{}.tmp-{}".format(checkpoint_path, os.getpid())
        with open(tmp_path, "w") as handle:
            handle.write("model_checkpoint_path: step-0")
        os.replace(tmp_path, checkpoint_path)

    # Make sure that other process waits until checkpoint file is created
    # by process 0.
    while not os.path.isfile(checkpoint_path):
        time.sleep(1)

    # Fetch the latest checkpoint index.
    with open(checkpoint_path, "r") as handle:
        latest_checkpoint = handle.readline().split()[-1]
        iteration = int(latest_checkpoint.split("-")[-1])

    return iteration
|
||||
|
||||
|
||||
def save_latest_checkpoint(checkpoint_dir, iteration):
    """Record ``iteration`` as the latest checkpoint in the index file.

    Args:
        checkpoint_dir: directory holding the ``checkpoint`` index file.
        iteration: iteration number of the checkpoint just saved.
    """
    index_file = os.path.join(checkpoint_dir, "checkpoint")
    record = "model_checkpoint_path: step-{}".format(iteration)
    # Overwrite the index with the single up-to-date record.
    with open(index_file, "w") as out:
        out.write(record)
|
||||
|
||||
|
||||
def load_parameters(checkpoint_dir, rank, model, optimizer=None,
                    iteration=None, file_path=None):
    """Load model (and optionally optimizer) state from a checkpoint.

    The checkpoint is chosen in this order: an explicit ``file_path``, then
    ``iteration``, then the latest iteration recorded in the index file.
    Nothing is loaded when the resolved iteration is 0.

    Args:
        checkpoint_dir: directory holding the checkpoints.
        rank: rank of the calling process (used for lookup and logging).
        model: object whose ``set_dict`` receives the model state.
        optimizer: optional object whose ``set_dict`` receives the optimizer
            state when such a state is present in the checkpoint.
        iteration: optional iteration number of the checkpoint to load.
        file_path: optional checkpoint path without file extension.
    """
    target = file_path
    if target is None:
        step = iteration
        if step is None:
            step = load_latest_checkpoint(checkpoint_dir, rank)
        if step == 0:
            # No checkpoint has been saved yet.
            return
        target = "{}/step-{}".format(checkpoint_dir, step)

    model_state, optimizer_state = dg.load_dygraph(target)

    model.set_dict(model_state)
    print("[checkpoint] Rank {}: loaded model from {}".format(rank, target))

    if not (optimizer and optimizer_state):
        return
    optimizer.set_dict(optimizer_state)
    print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
        rank, target))
|
||||
|
||||
|
||||
def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Save model (and optionally optimizer) state for ``iteration``.

    Both states are written with ``dg.save_dygraph`` to the same base path
    ``<checkpoint_dir>/step-<iteration>``.

    Args:
        checkpoint_dir: directory to write the checkpoint into.
        iteration: iteration number used in the checkpoint name.
        model: object whose ``state_dict()`` is saved.
        optimizer: optional object whose ``state_dict()`` is saved as well.
    """
    file_path = "{}/step-{}".format(checkpoint_dir, iteration)
    model_dict = model.state_dict()
    dg.save_dygraph(model_dict, file_path)
    print("[checkpoint] Saved model to {}".format(file_path))

    if optimizer:
        opt_dict = optimizer.state_dict()
        dg.save_dygraph(opt_dict, file_path)
        # Log message typo fixed ("optimzier" -> "optimizer").
        print("[checkpoint] Saved optimizer state to {}".format(file_path))
|
|
@ -0,0 +1,190 @@
|
|||
import itertools
|
||||
import os
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
from scipy.io.wavfile import write
|
||||
|
||||
import utils
|
||||
from data import LJSpeech
|
||||
from waveflow_modules import WaveFlowLoss, WaveFlowModule
|
||||
|
||||
|
||||
class WaveFlow():
    """Driver that wires data loading, the WaveFlow network and checkpoints.

    Args:
        config: configuration namespace shared by the scripts.
        checkpoint_dir: directory where checkpoints are read and written.
        parallel: whether data parallel training is used.
        rank: rank of the current process.
        nranks: total number of processes.
        tb_logger: tensorboard writer; used on rank 0 in the train and
            valid steps.
    """

    def __init__(self, config, checkpoint_dir, parallel=False, rank=0,
                 nranks=1, tb_logger=None):
        self.config = config
        self.checkpoint_dir = checkpoint_dir
        self.parallel = parallel
        self.rank = rank
        self.nranks = nranks
        self.tb_logger = tb_logger

    def build(self, training=True):
        """Create data loaders and the network, then load a checkpoint.

        Args:
            training: when true, the optimizer and the loss are created too
                and the network is wrapped for data parallelism if
                ``self.parallel`` is set.
        """
        config = self.config
        dataset = LJSpeech(config, self.nranks, self.rank)
        self.trainloader = dataset.trainloader
        self.validloader = dataset.validloader

        waveflow = WaveFlowModule("waveflow", config)

        # Dry run once to create and initialize all necessary parameters.
        # NOTE(review): 16000 samples / 63 frames presumably match the default
        # segment_length and fft_window_shift -- confirm before changing them.
        audio = dg.to_variable(np.random.randn(1, 16000).astype(np.float32))
        mel = dg.to_variable(
            np.random.randn(1, config.mel_bands, 63).astype(np.float32))
        waveflow(audio, mel)

        if training:
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=config.learning_rate)

            # Load parameters.
            utils.load_parameters(self.checkpoint_dir, self.rank,
                                  waveflow, optimizer,
                                  iteration=config.iteration,
                                  file_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            # Data parallelism.
            if self.parallel:
                strategy = dg.parallel.prepare_context()
                waveflow = dg.parallel.DataParallel(waveflow, strategy)

            self.waveflow = waveflow
            self.optimizer = optimizer
            self.criterion = WaveFlowLoss(config.sigma)

        else:
            # Load parameters.
            utils.load_parameters(self.checkpoint_dir, self.rank, waveflow,
                                  iteration=config.iteration,
                                  file_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            self.waveflow = waveflow

    def train_step(self, iteration):
        """Run one optimization step and log the loss on rank 0.

        Args:
            iteration: current iteration number, used for logging only.
        """
        self.waveflow.train()

        start_time = time.time()
        audios, mels = next(self.trainloader)
        load_time = time.time()

        outputs = self.waveflow(audios, mels)
        loss = self.criterion(outputs)

        if self.parallel:
            # loss = loss / num_trainers
            loss = self.waveflow.scale_loss(loss)
            loss.backward()
            self.waveflow.apply_collective_grads()
        else:
            loss.backward()

        self.optimizer.minimize(
            loss, parameter_list=self.waveflow.parameters())
        self.waveflow.clear_gradients()

        graph_time = time.time()

        if self.rank == 0:
            # Multiply by nranks for the logged value.
            # NOTE(review): presumably undoes the scaling done by
            # scale_loss in the parallel case -- confirm.
            loss_val = float(loss.numpy()) * self.nranks
            log = "Rank: {} Step: {:^8d} Loss: {:<8.3f} " \
                  "Time: {:.3f}/{:.3f}".format(
                      self.rank, iteration, loss_val,
                      load_time - start_time, graph_time - load_time)
            print(log)

            tb = self.tb_logger
            tb.add_scalar("Train-Loss-Rank-0", loss_val, iteration)

    @dg.no_grad
    def valid_step(self, iteration):
        """Evaluate the loss on the valid set and log it on rank 0.

        Args:
            iteration: current iteration number, used for logging only.
        """
        self.waveflow.eval()
        tb = self.tb_logger

        total_loss = []
        sample_audios = []
        start_time = time.time()

        for i, batch in enumerate(self.validloader()):
            audios, mels = batch
            valid_outputs = self.waveflow(audios, mels)
            valid_z, valid_log_s_list = valid_outputs

            # Visualize latent z and scale log_s.
            if self.rank == 0 and i == 0:
                tb.add_histogram("Valid-Latent_z", valid_z.numpy(), iteration)
                for j, valid_log_s in enumerate(valid_log_s_list):
                    hist_name = "Valid-{}th-Flow-Log_s".format(j)
                    tb.add_histogram(hist_name, valid_log_s.numpy(), iteration)

            valid_loss = self.criterion(valid_outputs)
            total_loss.append(float(valid_loss.numpy()))

        total_time = time.time() - start_time
        if self.rank == 0:
            loss_val = np.mean(total_loss)
            log = "Test | Rank: {} AvgLoss: {:<8.3f} Time {:<8.3f}".format(
                self.rank, loss_val, total_time)
            print(log)
            tb.add_scalar("Valid-Avg-Loss", loss_val, iteration)

    @dg.no_grad
    def infer(self, iteration):
        """Synthesize audio for valid samples and write it as wav files.

        When ``config.sample`` is set only that sample is synthesized,
        otherwise every valid sample is. Files are written to
        ``<config.output>/<config.name>/iter-<iteration>/valid_<index>.wav``.

        Args:
            iteration: iteration number of the loaded checkpoint; only used
                to name the output directory.
        """
        self.waveflow.eval()

        config = self.config
        sample = config.sample

        output = "{}/{}/iter-{}".format(config.output, config.name, iteration)
        os.makedirs(output, exist_ok=True)

        mels_list = [mels for _, mels in self.validloader()]
        # Keep each mel paired with its index in the valid set, so that a
        # single selected sample is saved under its own index rather than
        # always as "valid_0".
        if sample is not None:
            selected = [(sample, mels_list[sample])]
        else:
            selected = list(enumerate(mels_list))

        for index, mel in selected:
            filename = "{}/valid_{}.wav".format(output, index)
            print("Synthesize sample {}, save as {}".format(index, filename))

            start_time = time.time()
            audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
            syn_time = time.time() - start_time

            audio = audio[0]
            audio_time = audio.shape[0] / self.config.sample_rate
            print("audio time {:.4f}, synthesis time {:.4f}".format(
                audio_time, syn_time))

            # Denormalize audio from [-1, 1] to the int16 range. Clip before
            # casting: without it a value of +1.0 (or any overshoot) would
            # wrap around when converted to int16.
            audio = np.clip(audio.numpy() * 32768.0, -32768.0, 32767.0)
            audio = audio.astype('int16')
            write(filename, config.sample_rate, audio)

    @dg.no_grad
    def benchmark(self):
        """Time batched synthesis and print the real-time factor.

        All valid mels are concatenated along the last axis, cut to 864
        frames and repeated to a batch of 8; synthesis is then timed 10 times.
        """
        self.waveflow.eval()

        mels_list = [mels for _, mels in self.validloader()]
        mel = fluid.layers.concat(mels_list, axis=2)
        mel = mel[:, :, :864]
        batch_size = 8
        mel = fluid.layers.expand(mel, [batch_size, 1, 1])

        for i in range(10):
            start_time = time.time()
            audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
            # Stop the clock before printing so that console output is not
            # counted as synthesis time.
            syn_time = time.time() - start_time
            print("audio.shape = ", audio.shape)

            audio_time = audio.shape[1] * batch_size / self.config.sample_rate
            print("audio time {:.4f}, synthesis time {:.4f}".format(
                audio_time, syn_time))
            print("{} X real-time".format(audio_time / syn_time))

    def save(self, iteration):
        """Save network and optimizer state and update the checkpoint index.

        Args:
            iteration: iteration number used in the checkpoint name.
        """
        utils.save_latest_parameters(self.checkpoint_dir, iteration,
                                     self.waveflow, self.optimizer)
        utils.save_latest_checkpoint(self.checkpoint_dir, iteration)
|
|
@ -0,0 +1,351 @@
|
|||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
from paddle import fluid
|
||||
from parakeet.modules import conv, modules, weight_norm
|
||||
|
||||
|
||||
def set_param_attr(layer, c_in=1):
    """Attach weight and bias initializers to ``layer``.

    Weight-normalized 2-D (transposed) convolutions get uniform initializers
    in ``[-k, k]`` with ``k = sqrt(1 / (c_in * prod(filter_size)))``; a plain
    ``dg.Conv2D`` gets all-zero initializers.

    Args:
        layer: the layer to configure.
        c_in: number of input channels used to compute ``k``.

    Raises:
        TypeError: if ``layer`` is of neither supported kind.
    """
    weight_normed_types = (weight_norm.Conv2DTranspose, weight_norm.Conv2D)

    if isinstance(layer, weight_normed_types):
        fan_in = c_in * np.prod(layer._filter_size)
        bound = np.sqrt(1.0 / fan_in)
        w_init = fluid.initializer.UniformInitializer(low=-bound, high=bound)
        b_init = fluid.initializer.UniformInitializer(low=-bound, high=bound)
    elif isinstance(layer, dg.Conv2D):
        w_init = fluid.initializer.ConstantInitializer(0.0)
        b_init = fluid.initializer.ConstantInitializer(0.0)
    else:
        raise TypeError("Unsupported layer type.")

    layer._param_attr = fluid.ParamAttr(initializer=w_init)
    layer._bias_attr = fluid.ParamAttr(initializer=b_init)
|
||||
|
||||
|
||||
def unfold(x, n_group):
    """Split the last axis of ``x`` into ``[length // n_group, n_group]``.

    Args:
        x: variable whose ``shape`` is a list; its last axis is split.
        n_group: size of the new innermost axis.

    Returns:
        ``x`` reshaped to ``shape[:-1] + [length // n_group, n_group]``.
    """
    old_shape = x.shape
    n_columns = old_shape[-1] // n_group
    target_shape = old_shape[:-1] + [n_columns, n_group]
    return fluid.layers.reshape(x, target_shape)
|
||||
|
||||
|
||||
class WaveFlowLoss:
    """Loss computed from the latent ``z`` and the per-flow ``log_s`` terms."""

    def __init__(self, sigma=1.0):
        """Store ``sigma``, the scale used in the quadratic term."""
        self.sigma = sigma

    def __call__(self, model_output):
        """Compute the loss.

        Args:
            model_output: pair ``(z, log_s_list)`` where ``log_s_list`` is a
                sequence of ``log_s`` variables.

        Returns:
            ``(sum(z^2) / (2 sigma^2) - sum(log_s)) / numel(z)`` plus the
            constant ``0.5 * log(2 pi) + log(sigma)``.
        """
        z, log_s_list = model_output

        # Accumulate the sum of all log_s entries over every flow.
        log_s_total = None
        for log_s in log_s_list:
            flow_sum = fluid.layers.reduce_sum(log_s)
            if log_s_total is None:
                log_s_total = flow_sum
            else:
                log_s_total = log_s_total + flow_sum

        two_sigma_sq = 2 * self.sigma * self.sigma
        quadratic = fluid.layers.reduce_sum(z * z) / two_sigma_sq
        loss = quadratic - log_s_total
        # Average over all elements of z.
        loss = loss / np.prod(z.shape)
        const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)

        return loss + const
|
||||
|
||||
|
||||
class Conditioner(dg.Layer):
    """Upsamples its input along the last axis with transposed convolutions.

    Two transposed 2-D convolutions with stride 16 on the last axis are
    applied in sequence, each followed by a leaky ReLU.
    """

    def __init__(self, name_scope):
        super(Conditioner, self).__init__(name_scope)
        upsample_factors = [16, 16]

        # First create and initialize every upsampling layer ...
        self.upsample_conv2d = []
        for factor in upsample_factors:
            in_channel = 1
            upsampler = modules.Conv2DTranspose(
                self.full_name(),
                num_filters=1,
                filter_size=(3, 2 * factor),
                padding=(1, factor // 2),
                stride=(1, factor))
            set_param_attr(upsampler, c_in=in_channel)
            self.upsample_conv2d.append(upsampler)

        # ... then register them as sublayers under indexed names.
        for idx, upsampler in enumerate(self.upsample_conv2d):
            self.add_sublayer("conv2d_transpose_{}".format(idx), upsampler)

    def forward(self, x):
        """Upsample ``x`` (training path, no trimming).

        A channel axis is inserted at position 1 before the convolutions and
        removed again afterwards.
        """
        hidden = fluid.layers.unsqueeze(x, 1)
        for upsampler in self.upsample_conv2d:
            hidden = fluid.layers.leaky_relu(upsampler(hidden), alpha=0.4)

        return fluid.layers.squeeze(hidden, [1])

    def infer(self, x):
        """Upsample ``x`` (inference path), trimming each layer's output.

        After every transposed convolution the last
        ``filter_size[1] - stride[1]`` positions of the last axis are dropped
        before the leaky ReLU is applied.
        """
        hidden = fluid.layers.unsqueeze(x, 1)
        for upsampler in self.upsample_conv2d:
            hidden = upsampler(hidden)
            # Trim conv artifacts.
            time_cutoff = upsampler._filter_size[1] - upsampler._stride[1]
            trimmed = hidden[:, :, :, :-time_cutoff]
            hidden = fluid.layers.leaky_relu(trimmed, alpha=0.4)

        return fluid.layers.squeeze(hidden, [1])
|
||||
|
||||
|
||||
class Flow(dg.Layer):
    """A stack of gated 2-D convolutions conditioned on a mel input.

    Each of the ``n_layers`` layers pairs a dilated convolution over the
    audio with a 1x1 convolution over the conditioning input, applies a
    tanh/sigmoid gate and feeds a residual and a skip branch. The summed
    skip branches are projected to two channels by ``self.end``.
    """

    def __init__(self, name_scope, config):
        super(Flow, self).__init__(name_scope)
        self.n_layers = config.n_layers
        self.n_channels = config.n_channels
        self.kernel_h = config.kernel_h
        self.kernel_w = config.kernel_w

        # Transform audio: [batch, 1, n_group, time/n_group]
        # => [batch, n_channels, n_group, time/n_group]
        self.start = weight_norm.Conv2D(
            self.full_name(),
            num_filters=self.n_channels,
            filter_size=(1, 1))
        set_param_attr(self.start, c_in=1)

        # Initializing the last layer to 0 makes the affine coupling layers
        # do nothing at first, which helps training stability.
        # NOTE(review): assumes set_param_attr without c_in zero-initializes
        # the layer -- confirm against its definition.
        # output shape: [batch, 2, n_group, time/n_group]
        self.end = dg.Conv2D(
            self.full_name(),
            num_filters=2,
            filter_size=(1, 1))
        set_param_attr(self.end)

        # Receptive field: (kernel - 1) * sum(dilations) + 1 >= squeeze
        height_dilations = {
            8: [1, 1, 1, 1, 1, 1, 1, 1],
            16: [1, 1, 1, 1, 1, 1, 1, 1],
            32: [1, 2, 4, 1, 2, 4, 1, 2],
            64: [1, 2, 4, 8, 16, 1, 2, 4],
            128: [1, 2, 4, 8, 16, 32, 64, 1],
        }
        self.dilation_h_list = height_dilations[config.n_group]

        self.in_layers = []
        self.cond_layers = []
        self.res_skip_layers = []
        last_index = self.n_layers - 1
        for i in range(self.n_layers):
            dilation = (self.dilation_h_list[i], 2 ** i)

            in_layer = weight_norm.Conv2D(
                self.full_name(),
                num_filters=2 * self.n_channels,
                filter_size=(self.kernel_h, self.kernel_w),
                dilation=dilation)
            set_param_attr(in_layer, c_in=self.n_channels)
            self.in_layers.append(in_layer)

            cond_layer = weight_norm.Conv2D(
                self.full_name(),
                num_filters=2 * self.n_channels,
                filter_size=(1, 1))
            set_param_attr(cond_layer, c_in=config.mel_bands)
            self.cond_layers.append(cond_layer)

            # The final layer only produces a skip branch, no residual.
            if i < last_index:
                res_skip_channels = 2 * self.n_channels
            else:
                res_skip_channels = self.n_channels
            res_skip_layer = weight_norm.Conv2D(
                self.full_name(),
                num_filters=res_skip_channels,
                filter_size=(1, 1))
            set_param_attr(res_skip_layer, c_in=self.n_channels)
            self.res_skip_layers.append(res_skip_layer)

            self.add_sublayer("in_layer_{}".format(i), in_layer)
            self.add_sublayer("cond_layer_{}".format(i), cond_layer)
            self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer)

    def forward(self, audio, mel):
        """Run all layers over the full height at once.

        Args:
            audio: [bs, 1, n_group, time/group]
            mel: [bs, mel_bands, n_group, time/n_group]

        Returns:
            Output of ``self.end`` applied to the summed skip activations.
        """
        audio = self.start(audio)
        half = self.n_channels

        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
            dilation_w = 2 ** i

            # Pad height dim (n_group): causal convolution
            # Pad width dim (time): dilated non-causal convolution
            pad_top = (self.kernel_h - 1) * dilation_h
            pad_bottom = 0
            pad_side = int((self.kernel_w-1) * dilation_w / 2)
            audio_pad = fluid.layers.pad2d(
                audio, paddings=[pad_top, pad_bottom, pad_side, pad_side])

            in_acts = self.in_layers[i](audio_pad) + self.cond_layers[i](mel)
            # Gated activation: tanh on the first half of the channels,
            # sigmoid on the second half.
            out_acts = fluid.layers.tanh(in_acts[:, :half, :]) * \
                fluid.layers.sigmoid(in_acts[:, half:, :])
            res_skip_acts = self.res_skip_layers[i](out_acts)

            if i < self.n_layers - 1:
                audio += res_skip_acts[:, :half, :, :]
                skip_acts = res_skip_acts[:, half:, :, :]
            else:
                skip_acts = res_skip_acts

            if i == 0:
                output = skip_acts
            else:
                output += skip_acts

        return self.end(output)

    def infer(self, audio, mel, queues):
        """Run all layers for one step, using per-layer history queues.

        Args:
            audio: input for the current step.
            mel: conditioning input for the current step.
            queues (list): one list per layer holding earlier layer inputs;
                an empty list is filled with zeros on first use. The queues
                are updated in place.

        Returns:
            Output of ``self.end`` applied to the summed skip activations.
        """
        audio = self.start(audio)
        half = self.n_channels

        for i in range(self.n_layers):
            dilation_h = self.dilation_h_list[i]
            dilation_w = 2 ** i

            state_size = dilation_h * (self.kernel_h - 1)
            queue = queues[i]

            if len(queue) == 0:
                queue.extend(
                    fluid.layers.zeros_like(audio) for _ in range(state_size))

            # Stack the stored history and the current input along height.
            history = queue[0:state_size]
            state = fluid.layers.concat(history + [audio], axis=2)

            # Slide the window: forget the oldest entry, remember this one.
            queue.pop(0)
            queue.append(audio)

            # Height is covered by the history, so only the width dim
            # (time) is padded: dilated non-causal convolution.
            pad_left = int((self.kernel_w-1) * dilation_w / 2)
            pad_right = int((self.kernel_w-1) * dilation_w / 2)
            state = fluid.layers.pad2d(
                state, paddings=[0, 0, pad_left, pad_right])

            in_acts = self.in_layers[i](state) + self.cond_layers[i](mel)
            out_acts = fluid.layers.tanh(in_acts[:, :half, :]) * \
                fluid.layers.sigmoid(in_acts[:, half:, :])
            res_skip_acts = self.res_skip_layers[i](out_acts)

            if i < self.n_layers - 1:
                audio += res_skip_acts[:, :half, :, :]
                skip_acts = res_skip_acts[:, half:, :, :]
            else:
                skip_acts = res_skip_acts

            if i == 0:
                output = skip_acts
            else:
                output += skip_acts

        return self.end(output)
|
||||
|
||||
|
||||
class WaveFlowModule(dg.Layer):
    """WaveFlow: a conditioner followed by ``n_flows`` affine flows, with a
    fixed permutation of the height (``n_group``) dimension after each flow.
    """

    def __init__(self, name_scope, config):
        super(WaveFlowModule, self).__init__(name_scope)
        self.n_flows = config.n_flows
        self.n_group = config.n_group
        self.n_layers = config.n_layers
        assert self.n_group % 2 == 0
        assert self.n_flows % 2 == 0

        self.conditioner = Conditioner(self.full_name())
        self.flows = []
        for index in range(self.n_flows):
            flow = Flow(self.full_name(), config)
            self.flows.append(flow)
            self.add_sublayer("flow_{}".format(index), flow)

        # First half of the flows: reverse the whole height dimension.
        # Second half: reverse each half of the height dimension separately.
        half = self.n_group // 2
        lower = list(range(half))
        upper = list(range(half, self.n_group))
        self.perms = []
        for index in range(self.n_flows):
            if index < self.n_flows // 2:
                perm = (lower + upper)[::-1]
            else:
                perm = lower[::-1] + upper[::-1]
            self.perms.append(perm)

    def _permute_height(self, x, perm):
        """Reorder the slices of ``x`` along axis 2 according to ``perm``."""
        slices = [x[:, :, j, :] for j in perm]
        return fluid.layers.stack(slices, axis=2)

    def forward(self, audio, mel):
        """Map audio to latent ``z``.

        Args:
            audio: [bs, time]
            mel: [bs, mel_bands, frames], upsampled by the conditioner.

        Returns:
            tuple: ``(z, log_s_list)`` where ``z`` is
            [bs, n_group, time/n_group] and ``log_s_list`` holds one
            log-scale tensor per flow.
        """
        mel = self.conditioner(mel)
        assert mel.shape[2] >= audio.shape[1]
        # Prune out the tail of audio/mel so that time % n_group == 0.
        pruned_len = audio.shape[1] // self.n_group * self.n_group

        if audio.shape[1] > pruned_len:
            audio = audio[:, :pruned_len]
        if mel.shape[2] > pruned_len:
            mel = mel[:, :, :pruned_len]

        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
        # From [bs, time] to [bs, n_group, time/n_group]
        audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
        # [bs, 1, n_group, time/n_group]
        audio = fluid.layers.unsqueeze(audio, 1)

        log_s_list = []
        for flow, perm in zip(self.flows, self.perms):
            # Row h is transformed using rows < h and the mel of row h.
            outputs = flow(audio[:, :, :-1, :], mel[:, :, 1:, :])
            log_s = outputs[:, :1, :, :]
            b = outputs[:, 1:, :, :]
            log_s_list.append(log_s)

            first_row = audio[:, :, :1, :]
            transformed = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b
            audio = fluid.layers.concat([first_row, transformed], axis=2)

            # Permute over the height dim.
            audio = self._permute_height(audio, perm)
            mel = self._permute_height(mel, perm)

        z = fluid.layers.squeeze(audio, [1])

        return z, log_s_list

    def synthesize(self, mel, sigma=1.0):
        """Generate audio from a mel input by inverting the flows.

        Args:
            mel: [bs, mel_bands, frames].
            sigma (float): standard deviation of the sampled Gaussian noise.

        Returns:
            audio: [bs, time]
        """
        mel = self.conditioner.infer(mel)
        # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
        mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])

        noise_shape = [mel.shape[0], 1, mel.shape[2], mel.shape[3]]
        audio = fluid.layers.gaussian_random(shape=noise_shape, std=sigma)

        for flow, perm in zip(reversed(self.flows), reversed(self.perms)):
            # Permute over the height dimension.
            audio = self._permute_height(audio, perm)
            mel = self._permute_height(mel, perm)

            # The first row passes through unchanged; every later row is
            # inverted from the row generated just before it.
            audio_h = audio[:, :, 0:1, :]
            audio_list = [audio_h]
            queues = [[] for _ in range(self.n_layers)]

            for h in range(1, self.n_group):
                conds = mel[:, :, h:(h+1), :]
                outputs = flow.infer(audio_h, conds, queues)

                log_s = outputs[:, 0:1, :, :]
                b = outputs[:, 1:, :, :]
                audio_h = (audio[:, :, h:(h+1), :] - b) / \
                    fluid.layers.exp(log_s)
                audio_list.append(audio_h)

            audio = fluid.layers.concat(audio_list, axis=2)

        # audio: [bs, n_group, time/n_group]
        audio = fluid.layers.squeeze(audio, [1])
        # audio: [bs, time]
        audio = fluid.layers.reshape(
            fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])

        return audio
|
|
@ -0,0 +1,52 @@
|
|||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
class DynamicGRU(dg.Layer):
    """GRU over a (B, T, C) sequence, unrolled step by step with ``dg.GRUUnit``.

    ``init_size`` is accepted but not used.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse
        self.gru_unit = dg.GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)

    def forward(self, inputs):
        """
        Dynamic GRU block.

        Args:
            inputs (Variable): Shape(B, T, C), dtype: float32. The input value.
        Returns:
            output (Variable), Shape(B, T, C), the result compute by GRU.
        """
        num_steps = inputs.shape[1]
        time_order = range(num_steps)
        if self.is_reverse:
            # Walk the sequence from the last step to the first.
            time_order = reversed(time_order)

        state = self.h_0
        outputs = []
        for t in time_order:
            frame = inputs[:, t:t + 1, :]
            frame = layers.reshape(
                frame, [-1, frame.shape[2]], inplace=False)
            state, _, _ = self.gru_unit(frame, state)
            outputs.append(
                layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            # Restore chronological order of the collected states.
            outputs = outputs[::-1]
        return layers.concat(outputs, axis=1)
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
import paddle.fluid as fluid
|
||||
import math
|
||||
from parakeet.modules.layers import Conv
|
||||
|
||||
|
||||
class PositionwiseFeedForward(dg.Layer):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.dropout = dropout

        # Both convolutions share everything except their channel counts.
        shared = dict(filter_size=filter_size,
                      padding=padding,
                      use_cudnn=use_cudnn,
                      data_format="NTC")
        self.w_1 = Conv(in_channels=d_in, out_channels=num_hidden, **shared)
        self.w_2 = Conv(in_channels=num_hidden, out_channels=d_in, **shared)
        self.layer_norm = dg.LayerNorm(d_in)

    def forward(self, input):
        """
        Feed Forward Network.

        Args:
            input (Variable): Shape(B, T, C), dtype: float32. The input value.
        Returns:
            output (Variable), Shape(B, T, C), the result after FFN.
        """
        # Expand, apply ReLU, project back.
        hidden = layers.relu(self.w_1(input))
        projected = self.w_2(hidden)

        # Dropout, residual connection, then layer normalization.
        projected = layers.dropout(projected, self.dropout)
        residual = projected + input
        return self.layer_norm(residual)
|
|
@ -0,0 +1,158 @@
|
|||
import math
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
from paddle import fluid
|
||||
import paddle.fluid.dygraph as dg
|
||||
|
||||
|
||||
class Conv(dg.Layer):
    """1-D convolution with Xavier-initialized weights.

    Thin wrapper around ``Conv1D`` that fixes the parameter initializers.

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        filter_size (int): width of the convolution kernel.
        padding (int): padding passed through to ``Conv1D``.
        dilation (int): dilation passed through to ``Conv1D``.
        stride (int): stride passed through to ``Conv1D``.
        use_cudnn (bool): passed through to ``Conv1D``.
        data_format (str): ``"NCT"`` or ``"NTC"``, passed through to
            ``Conv1D``.
        is_bias (bool): when ``False`` the convolution has no bias;
            otherwise the bias is drawn uniformly from
            ``[-sqrt(1/in_channels), sqrt(1/in_channels)]``.
    """

    def __init__(self, in_channels, out_channels, filter_size=1,
                 padding=0, dilation=1, stride=1, use_cudnn=True,
                 data_format="NCT", is_bias=True):
        super(Conv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.padding = padding
        self.dilation = dilation
        self.stride = stride
        self.use_cudnn = use_cudnn
        self.data_format = data_format
        self.is_bias = is_bias

        self.weight_attr = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        if is_bias is not False:
            # 1.0 keeps this a true division regardless of interpreter.
            k = math.sqrt(1.0 / in_channels)
            self.bias_attr = fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-k, high=k))
        else:
            # bias_attr=None makes the underlying Conv2D create a default
            # bias; only an explicit False disables it.
            self.bias_attr = False

        self.conv = Conv1D(in_channels=in_channels,
                           out_channels=out_channels,
                           filter_size=filter_size,
                           padding=padding,
                           dilation=dilation,
                           stride=stride,
                           param_attr=self.weight_attr,
                           bias_attr=self.bias_attr,
                           use_cudnn=use_cudnn,
                           data_format=data_format)

    def forward(self, x):
        """Apply the convolution to ``x`` and return the result."""
        return self.conv(x)
|
||||
|
||||
class Conv1D(dg.Layer):
    """
    A convolution 1D block implemented with Conv2D: the input gets a dummy
    height axis of size 1 and is convolved with a ``(1, filter_size)`` kernel.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        super(Conv1D, self).__init__(dtype=dtype)

        self.in_channels = in_channels
        self.num_filters = out_channels
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # Every 1-D hyper-parameter becomes the width component of its
        # 2-D counterpart; the height component is the identity.
        self.conv = dg.Conv2D(
            num_channels=in_channels,
            num_filters=out_channels,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): rank-3 input, Shape(B, C_in, T) when data_format
                is 'NCT' or Shape(B, T, C_in) when it is 'NTC', where C_in
                means input channels.
        Returns:
            x (Variable): the outputs in the same layout as the input, with
                C_out (num_filters) channels.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        # Insert the dummy height axis, convolve, and drop it again.
        x = fluid.layers.unsqueeze(x, [2])
        x = self.conv(x)
        x = fluid.layers.squeeze(x, [2])
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
||||
|
||||
class Pool1D(dg.Layer):
    """
    A Pool 1D block implemented with Pool2D: the input gets a dummy height
    axis of size 1 and is pooled with a ``[1, pool_size]`` window.
    """

    def __init__(self,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT'):
        super(Pool1D, self).__init__()
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format

        self.pool2d = dg.Pool2D(
            [1, pool_size],
            pool_type=pool_type,
            pool_stride=[1, pool_stride],
            pool_padding=[0, pool_padding],
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive)

    def forward(self, x):
        """
        Args:
            x (Variable): rank-3 input, Shape(B, C, T) when data_format is
                'NCT' or Shape(B, T, C) when it is 'NTC'.
        Returns:
            x (Variable): the pooled outputs in the same layout as the input.
        """
        channels_last = self.data_format == 'NTC'
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        # Insert the dummy height axis, pool, and drop it again.
        x = fluid.layers.unsqueeze(x, [2])
        x = self.pool2d(x)
        x = fluid.layers.squeeze(x, [2])
        if channels_last:
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
|
|
@ -0,0 +1,112 @@
|
|||
import math
|
||||
import numpy as np
|
||||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
class ScaledDotProductAttention(dg.Layer):
    """Attention weights from ``softmax(Q K^T / sqrt(d_key))`` applied to V."""

    def __init__(self, d_key):
        super(ScaledDotProductAttention, self).__init__()

        self.d_key = d_key

    # NOTE: the mask convention here differs from the PyTorch one: entries
    # equal to 0 are the ones that get masked out.
    def forward(self, key, value, query, mask=None, query_mask=None, dropout=0.1):
        """
        Scaled Dot Product Attention.

        Args:
            key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
            value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
            query (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
            mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
            query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
            dropout (Constant): dtype: float32. The probability of dropout.
        Returns:
            result (Variable), Shape(B, T, C), the result of mutihead attention.
            attention (Variable), Shape(n_head * B, T, C), the attention of key.
        """
        # Compute attention score
        scores = layers.matmul(query, key, transpose_y=True)  # transpose the last dim in y
        scores = scores / math.sqrt(self.d_key)

        # Mask key to ignore padding: zero the score, then push it to a
        # large negative value so softmax gives it (almost) no weight.
        if mask is not None:
            scores = scores * mask
            penalty = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
            scores = scores + penalty

        weights = layers.softmax(scores)
        weights = layers.dropout(weights, dropout)

        # Mask query to ignore padding
        if query_mask is not None:
            weights = weights * query_mask

        return layers.matmul(weights, value), weights
|
||||
|
||||
class MultiheadAttention(dg.Layer):
    """Multi-head attention with a residual connection and layer norm."""

    def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1):
        super(MultiheadAttention, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
        self.d_k = d_k
        self.d_q = d_q
        self.dropout = dropout

        self.key = dg.Linear(num_hidden, num_head * d_k)
        self.value = dg.Linear(num_hidden, num_head * d_k)
        self.query = dg.Linear(num_hidden, num_head * d_q)

        self.scal_attn = ScaledDotProductAttention(d_k)

        self.fc = dg.Linear(num_head * d_q, num_hidden)

        self.layer_norm = dg.LayerNorm(num_hidden)

    def forward(self, key, value, query_input, mask=None, query_mask=None):
        """
        Multihead Attention.

        Args:
            key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
            value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
            query_input (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
            mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
            query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
        Returns:
            result (Variable), Shape(B, T, C), the result of mutihead attention.
            attention (Variable), Shape(n_head * B, T, C), the attention of key.
        """
        batch_size = key.shape[0]
        seq_len_key = key.shape[1]
        seq_len_query = query_input.shape[1]
        heads = self.num_head

        # repeat masks h times
        if query_mask is not None:
            query_mask = layers.expand(query_mask, [heads, 1, seq_len_key])
        if mask is not None:
            mask = layers.expand(mask, (heads, 1, 1))

        def split_heads(projected, seq_len, depth):
            # (B, T, heads * depth) -> (heads * B, T, depth)
            per_head = layers.reshape(
                projected, [batch_size, seq_len, heads, depth])
            per_head = layers.transpose(per_head, [2, 0, 1, 3])
            return layers.reshape(per_head, [-1, seq_len, depth])

        # Make multihead attention
        key = split_heads(self.key(key), seq_len_key, self.d_k)
        value = split_heads(self.value(value), seq_len_key, self.d_k)
        query = split_heads(self.query(query_input), seq_len_query, self.d_q)

        result, attention = self.scal_attn(
            key, value, query, mask=mask, query_mask=query_mask)

        # concat all multihead result
        result = layers.reshape(
            result, [heads, batch_size, seq_len_query, self.d_q])
        result = layers.transpose(result, [1,2,0,3])
        result = layers.reshape(result, [batch_size, seq_len_query, -1])

        # Output projection, dropout, residual connection and layer norm.
        result = layers.dropout(self.fc(result), self.dropout)
        result = self.layer_norm(result + query_input)
        return result, attention
|
|
@ -0,0 +1,75 @@
|
|||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid as fluid
|
||||
import paddle.fluid.layers as layers
|
||||
from parakeet.modules.layers import Conv
|
||||
|
||||
class PostConvNet(dg.Layer):
    """Stack of 1-D convolutions, each followed by batch norm, tanh and
    dropout, mapping mel channels to ``num_hidden`` and back."""

    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
                 padding=0,
                 num_conv=5,
                 outputs_per_step=1,
                 use_cudnn=True,
                 dropout=0.1):
        super(PostConvNet, self).__init__()

        self.dropout = dropout
        mel_channels = n_mels * outputs_per_step

        def make_conv(c_in, c_out):
            return Conv(in_channels=c_in,
                        out_channels=c_out,
                        filter_size=filter_size,
                        padding=padding,
                        use_cudnn=use_cudnn,
                        data_format="NCT")

        def make_batch_norm(channels):
            return dg.BatchNorm(channels,
                                param_attr=fluid.ParamAttr(name='weight'),
                                bias_attr=fluid.ParamAttr(name='bias'),
                                moving_mean_name='moving_mean',
                                moving_variance_name='moving_var',
                                data_layout='NCHW')

        # mel -> hidden, hidden -> hidden (middle layers), hidden -> mel
        self.conv_list = [make_conv(mel_channels, num_hidden)]
        for _ in range(1, num_conv-1):
            self.conv_list.append(make_conv(num_hidden, num_hidden))
        self.conv_list.append(make_conv(num_hidden, mel_channels))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            make_batch_norm(num_hidden) for _ in range(num_conv-1)]
        self.batch_norm_list.append(make_batch_norm(mel_channels))
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

    def forward(self, input):
        """
        Post Conv Net.

        Args:
            input (Variable): Shape(B, T, C), dtype: float32. The input value.
        Returns:
            output (Variable), Shape(B, T, C), the result after postconvnet.
        """
        # Convolutions operate channel-first.
        hidden = layers.transpose(input, [0,2,1])
        seq_len = hidden.shape[-1]
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            # Cut the convolution output back to the input length.
            trimmed = conv(hidden)[:,:,:seq_len]
            hidden = layers.dropout(
                layers.tanh(batch_norm(trimmed)), self.dropout)
        return layers.transpose(hidden, [0,2,1])
|
|
@ -0,0 +1,31 @@
|
|||
import paddle.fluid.dygraph as dg
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
class PreNet(dg.Layer):
    """Two linear layers, each followed by ReLU and dropout."""

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        :param dropout_rate: dropout probability applied after each layer
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        self.linear1 = dg.Linear(input_size, hidden_size)
        self.linear2 = dg.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Pre Net before passing through the network.

        Args:
            x (Variable): Shape(B, T, C), dtype: float32. The input value.
        Returns:
            x (Variable), Shape(B, T, C), the result after pernet.
        """
        for linear in (self.linear1, self.linear2):
            activated = layers.relu(linear(x))
            x = layers.dropout(activated, self.dropout_rate)
        return x
|
|
@ -0,0 +1,73 @@
|
|||
import numpy as np
|
||||
import librosa
|
||||
import os, copy
|
||||
from scipy import signal
|
||||
import paddle.fluid.layers as layers
|
||||
|
||||
|
||||
def get_positional_table(d_pos_vec, n_position=1024):
    """Build an ``(n_position, d_pos_vec)`` sinusoidal position table.

    Row 0 is all zeros. For ``pos >= 1`` column ``i`` starts from the angle
    ``pos / 10000 ** (2 * i / d_pos_vec)``; even columns take its sine and
    odd columns its cosine.
    """
    rows = []
    for pos in range(n_position):
        if pos == 0:
            rows.append(np.zeros(d_pos_vec))
            continue
        rows.append([pos / np.power(10000, 2*i/d_pos_vec)
                     for i in range(d_pos_vec)])
    table = np.array(rows)

    nonzero = table[1:]  # view: writes land in `table`
    nonzero[:, 0::2] = np.sin(nonzero[:, 0::2])  # dim 2i
    nonzero[:, 1::2] = np.cos(nonzero[:, 1::2])  # dim 2i+1
    return table
|
||||
|
||||
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table '''
    # Column j of row `pos` uses the angle pos / 10000 ** (2 * (j // 2) / d_hid),
    # so each sine/cosine pair shares one frequency.
    angle_rows = []
    for position in range(n_position):
        angle_rows.append(
            [position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
             for hid_idx in range(d_hid)])
    sinusoid_table = np.array(angle_rows)

    even_cols = sinusoid_table[:, 0::2]
    odd_cols = sinusoid_table[:, 1::2]
    sinusoid_table[:, 0::2] = np.sin(even_cols)  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(odd_cols)  # dim 2i+1

    if padding_idx is None:
        return sinusoid_table

    # zero vector for padding dimension
    sinusoid_table[padding_idx] = 0.
    return sinusoid_table
|
||||
|
||||
def get_non_pad_mask(seq):
    """Return a float32 mask with a trailing unit axis that is 1 where
    ``seq`` is non-zero and 0 elsewhere."""
    non_pad = (seq != 0).astype(np.float32)
    return layers.unsqueeze(non_pad,[-1])
|
||||
|
||||
def get_attn_key_pad_mask(seq_k, seq_q):
    ''' For masking out the padding part of key sequence. '''
    query_len = seq_q.shape[1]
    key_is_real = (seq_k != 0).astype(np.float32)

    # Expand to fit the shape of key query attention matrix:
    # insert a query axis and repeat the key mask along it.
    key_is_real = layers.unsqueeze(key_is_real,[1])
    return layers.expand(key_is_real, [1, query_len, 1])
|
||||
|
||||
def get_triu_tensor(seq_k, seq_q):
    ''' For make a triu tensor '''
    batch_size, len_k = seq_k.shape[0], seq_k.shape[1]
    len_q = seq_q.shape[1]

    # Ones strictly above the main diagonal, zeros elsewhere.
    upper = np.triu(np.ones([len_k, len_q]), 1)
    # One copy of the (len_k, len_q) matrix per batch element.
    return np.tile(upper[np.newaxis], (batch_size, 1, 1))
|
||||
|
||||
def guided_attention(N, T, g=0.2):
    '''Guided attention. Refer to page 3 on the paper.'''
    # W[n, t] = 1 - exp(-(t/T - n/N)^2 / (2 g^2)): zero on the diagonal
    # t/T == n/N and approaching 1 away from it.
    W = np.zeros((N, T), dtype=np.float32)
    denominator = 2 * g * g
    for n_pos, t_pos in np.ndindex(*W.shape):
        offset = t_pos / float(T) - n_pos / float(N)
        W[n_pos, t_pos] = 1 - np.exp(-offset ** 2 / denominator)
    return W
|
||||
|
||||
|
||||
def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
    """Weighted binary cross entropy, summed over the first two axes.

    Args:
        input: predicted probabilities.
        label: targets; each element's loss is scaled by
            ``label * (position_weight - 1) + 1``.
        position_weight (float): factor used in the per-element scale.
        epsilon (float): added inside both logarithms for stability.
    """
    positive_term = label * layers.log(input + epsilon)
    negative_term = (1-label) * layers.log(1 - input + epsilon)
    element_loss = -1 * positive_term - negative_term

    weights = label * (position_weight - 1) + 1
    return layers.reduce_sum(element_loss * weights, dim=[0, 1])
|
||||
|
||||
|
Loading…
Reference in New Issue