diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py
new file mode 100644
index 0000000..6212dee
--- /dev/null
+++ b/parakeet/audio/__init__.py
@@ -0,0 +1 @@
+from .audio import AudioProcessor
\ No newline at end of file
diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py
new file mode 100644
index 0000000..6b84701
--- /dev/null
+++ b/parakeet/audio/audio.py
@@ -0,0 +1,261 @@
+import librosa
+import soundfile as sf
+import numpy as np
+import scipy.io
+import scipy.signal
+
+class AudioProcessor(object):
+    def __init__(self,
+                 sample_rate=None,        # int: sampling rate
+                 num_mels=None,           # int: number of bands of the mel spectrogram
+                 min_level_db=None,       # float: minimum level db
+                 ref_level_db=None,       # float: reference level db
+                 n_fft=None,              # int: number of samples in a frame for stft
+                 win_length=None,         # int: the same meaning as n_fft
+                 hop_length=None,         # int: number of samples between neighboring frames
+                 power=None,              # float: power to raise before griffin-lim
+                 preemphasis=None,        # float: preemphasis coefficient
+                 signal_norm=None,        # bool: whether to normalize the spectrogram
+                 symmetric_norm=False,    # bool: apply clip norm in [-max_norm, max_norm]
+                 max_norm=None,           # float: max norm
+                 mel_fmin=None,           # int: mel spectrogram's minimum frequency
+                 mel_fmax=None,           # int: mel spectrogram's maximum frequency
+                 clip_norm=True,          # bool: clip spectrogram's norm
+                 griffin_lim_iters=None,  # int: number of griffin-lim iterations
+                 do_trim_silence=False,   # bool: trim silence
+                 sound_norm=False,
+                 **kwargs):
+        self.sample_rate = sample_rate
+        self.num_mels = num_mels
+        self.min_level_db = min_level_db
+        self.ref_level_db = ref_level_db
+
+        # stft related
+        self.n_fft = n_fft
+        self.win_length = win_length or n_fft
+        # hop length defaults to 1/4 window length
+        self.hop_length = hop_length or self.win_length // 4
+
+        self.power = power
+        self.preemphasis = float(preemphasis)
+
+        self.griffin_lim_iters = griffin_lim_iters
+        self.signal_norm = signal_norm
+        self.symmetric_norm = symmetric_norm
+
+        # mel transform related
+        self.mel_fmin = mel_fmin
+        self.mel_fmax = mel_fmax
+
+        self.max_norm = 1.0 if max_norm is None else float(max_norm)
+        self.clip_norm = clip_norm
+        self.do_trim_silence = do_trim_silence
+
+        self.sound_norm = sound_norm
+        self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
+
+    def _stft_parameters(self):
+        """compute frame length and hop length in ms"""
+        frame_length_ms = self.win_length * 1. / self.sample_rate
+        frame_shift_ms = self.hop_length * 1. / self.sample_rate
+        num_freq = 1 + self.n_fft // 2
+        return num_freq, frame_length_ms, frame_shift_ms
+
+    def __repr__(self):
+        """object repr"""
+        cls_name_str = self.__class__.__name__
+        members = vars(self)
+        dict_str = "\n".join(["  {}: {},".format(k, v) for k, v in members.items()])
+        repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
+        return repr_str
+
+    def save_wav(self, path, wav):
+        """save audio with scipy.io.wavfile as 16-bit integers"""
+        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
+        scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
+
+    def load_wav(self, path, sr=None):
+        """load wav -> trim_silence -> rescale"""
+        x, sr = librosa.load(path, sr=None)
+        assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
+            sr, self.sample_rate)
+        if self.do_trim_silence:
+            try:
+                x = self.trim_silence(x)
+            except ValueError:
+                print(" [!] File cannot be trimmed for silence - {}".format(path))
+        if self.sound_norm:
+            x = x / x.max() * 0.9  # rescale to 90% of full scale to leave some headroom
+        return x
+
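+    # Illustrative usage (a sketch, not called anywhere in this module); the
+    # parameter values below are copied from the LJSpeech config introduced in
+    # this change, not library defaults:
+    #
+    #   processor = AudioProcessor(sample_rate=22050, num_mels=80,
+    #                              min_level_db=-100, ref_level_db=20,
+    #                              n_fft=2048, win_length=1102, hop_length=275,
+    #                              power=1.2, preemphasis=0.97, signal_norm=True,
+    #                              griffin_lim_iters=60)
+    #   wav = processor.load_wav("sample.wav")
+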
+    def trim_silence(self, wav):
+        """Trim silent parts with a threshold and a 0.01s margin"""
+        margin = int(self.sample_rate * 0.01)
+        wav = wav[margin:-margin]
+        trimmed_wav = librosa.effects.trim(
+            wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
+        return trimmed_wav
+
+    def apply_preemphasis(self, x):
+        if self.preemphasis == 0.:
+            raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+        return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
+
+    def apply_inv_preemphasis(self, x):
+        if self.preemphasis == 0.:
+            raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+        return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
+
+    def _amplitude_to_db(self, x):
+        amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
+        return 20 * np.log10(np.maximum(amplitude_min, x))
+
+    @staticmethod
+    def _db_to_amplitude(x):
+        return np.power(10., 0.05 * x)
+
+    def _linear_to_mel(self, spectrogram):
+        _mel_basis = self._build_mel_basis()
+        return np.dot(_mel_basis, spectrogram)
+
+    def _mel_to_linear(self, mel_spectrogram):
+        inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
+        return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrogram))
+
+    def _build_mel_basis(self):
+        """return mel basis for mel scale"""
+        if self.mel_fmax is not None:
+            assert self.mel_fmax <= self.sample_rate // 2
+        return librosa.filters.mel(
+            self.sample_rate,
+            self.n_fft,
+            n_mels=self.num_mels,
+            fmin=self.mel_fmin,
+            fmax=self.mel_fmax)
+
+    def _normalize(self, S):
+        """put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
+        if self.signal_norm:
+            S_norm = (S - self.min_level_db) / (-self.min_level_db)
+            if self.symmetric_norm:
+                S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
+                if self.clip_norm:
+                    S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
+                return S_norm
+            else:
+                S_norm = self.max_norm * S_norm
+                if self.clip_norm:
+                    S_norm = np.clip(S_norm, 0, self.max_norm)
+                return S_norm
+        else:
+            return S
+
+    def _denormalize(self, S):
+        """denormalize values"""
+        S_denorm = S
+        if self.signal_norm:
+            if self.symmetric_norm:
+                if self.clip_norm:
+                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
+                S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
+                return S_denorm
+            else:
+                if self.clip_norm:
+                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
+                S_denorm = S_denorm * (-self.min_level_db) / self.max_norm + self.min_level_db
+                return S_denorm
+        else:
+            return S
+
+    def _stft(self, y):
+        return librosa.stft(
+            y=y,
+            n_fft=self.n_fft,
+            win_length=self.win_length,
+            hop_length=self.hop_length)
+
+    def _istft(self, S):
+        return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
+
+    def spectrogram(self, y):
+        """compute linear spectrogram (amplitude)
+        preemphasis -> stft -> mag -> amplitude_to_db -> minus_ref_level_db -> normalize
+        """
+        if self.preemphasis:
+            D = self._stft(self.apply_preemphasis(y))
+        else:
+            D = self._stft(y)
+        S = self._amplitude_to_db(np.abs(D)) - self.ref_level_db
+        return self._normalize(S)
+
+    def melspectrogram(self, y):
+        """compute mel spectrogram (amplitude)
+        preemphasis -> stft -> mag -> mel_scale -> amplitude_to_db -> minus_ref_level_db -> normalize
+        """
+        if self.preemphasis:
+            D = self._stft(self.apply_preemphasis(y))
+        else:
+            D = self._stft(y)
+        S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
+        return self._normalize(S)
+
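+    # Illustrative round trip (a sketch, not called anywhere in this module):
+    # `spectrogram`/`melspectrogram` produce normalized log-magnitude features,
+    # and `inv_spectrogram`/`inv_melspectrogram` below approximately invert
+    # them with griffin-lim:
+    #
+    #   S = processor.melspectrogram(wav)       # (num_mels, n_frames)
+    #   wav_hat = processor.inv_melspectrogram(S)
+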
+    def inv_spectrogram(self, spectrogram):
+        """convert spectrogram back to waveform using griffin-lim in librosa"""
+        S = self._denormalize(spectrogram)
+        S = self._db_to_amplitude(S + self.ref_level_db)
+        if self.preemphasis:
+            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
+        return self._griffin_lim(S ** self.power)
+
+    def inv_melspectrogram(self, mel_spectrogram):
+        S = self._denormalize(mel_spectrogram)
+        S = self._db_to_amplitude(S + self.ref_level_db)
+        S = self._mel_to_linear(np.abs(S))  # mel -> linear, needed before griffin-lim
+        if self.preemphasis:
+            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
+        return self._griffin_lim(S ** self.power)
+
+    def out_linear_to_mel(self, linear_spec):
+        """convert output linear spec to mel spec"""
+        S = self._denormalize(linear_spec)
+        S = self._db_to_amplitude(S + self.ref_level_db)
+        S = self._linear_to_mel(np.abs(S))
+        S = self._amplitude_to_db(S) - self.ref_level_db
+        mel = self._normalize(S)
+        return mel
+
+    def _griffin_lim(self, S):
+        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
+        S_complex = np.abs(S).astype(np.complex128)
+        y = self._istft(S_complex * angles)
+        for _ in range(self.griffin_lim_iters):
+            angles = np.exp(1j * np.angle(self._stft(y)))
+            y = self._istft(S_complex * angles)
+        return y
+
+    @staticmethod
+    def mulaw_encode(wav, qc):
+        mu = 2 ** qc - 1
+        # wav_abs = np.minimum(np.abs(wav), 1.0)
+        signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
+        # Quantize signal to the specified number of levels.
+        signal = (signal + 1) / 2 * mu + 0.5
+        return np.floor(signal)
+
+    @staticmethod
+    def mulaw_decode(wav, qc):
+        """Recover the waveform from mu-law companded values."""
+        mu = 2 ** qc - 1
+        x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
+        return x
+
+    @staticmethod
+    def encode_16bits(x):
+        return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
+
+    @staticmethod
+    def quantize(x, bits):
+        return (x + 1.) * (2**bits - 1) / 2
+
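+    # Illustrative sketch: `quantize`/`dequantize` map a signal in [-1, 1]
+    # to integer levels and back; e.g. with bits=8, quantize(0.0, 8) gives
+    # 127.5 and dequantize(127.5, 8) returns 0.0 (up to float rounding).
+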
+    @staticmethod
+    def dequantize(x, bits):
+        return 2 * x / (2**bits - 1) - 1
diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py
index 9303b46..8777472 100644
--- a/parakeet/data/batch.py
+++ b/parakeet/data/batch.py
@@ -88,7 +88,7 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
         mono_channel = False
 
     lengths = [example.shape[-1] for example in minibatch]  # assume (channel, F, n_frame) or (F, n_frame)
-    max_len = np.max(lengths)
+    max_len = np.max(lengths)
 
     batch = []
     for example in minibatch:
diff --git a/parakeet/data/datacargo.py b/parakeet/data/datacargo.py
index 1d7d8d5..e087a4f 100644
--- a/parakeet/data/datacargo.py
+++ b/parakeet/data/datacargo.py
@@ -2,7 +2,8 @@ from .sampler import SequentialSampler, RandomSampler, BatchSampler
 
 class DataCargo(object):
     def __init__(self, dataset, batch_size=1, sampler=None,
-                 shuffle=False, batch_sampler=None, drop_last=False):
+                 shuffle=False, batch_sampler=None, collate_fn=None,
+                 drop_last=False):
         self.dataset = dataset
 
         if batch_sampler is not None:
@@ -21,13 +22,20 @@ class DataCargo(object):
                 sampler = RandomSampler(dataset)
             else:
                 sampler = SequentialSampler(dataset)
-            # auto_collation without custom batch_sampler
             batch_sampler = BatchSampler(sampler, batch_size, drop_last)
+        else:
+            batch_sampler = BatchSampler(sampler, batch_size, drop_last)
+
+        self.batch_sampler = batch_sampler
+
+        if collate_fn is None:
+            collate_fn = dataset._batch_examples
+        self.collate_fn = collate_fn
 
         self.batch_size = batch_size
         self.drop_last = drop_last
         self.sampler = sampler
-        self.batch_sampler = batch_sampler
+
 
     def __iter__(self):
         return DataIterator(self)
@@ -57,6 +65,7 @@ class DataIterator(object):
 
         self._index_sampler = loader._index_sampler
         self._sampler_iter = iter(self._index_sampler)
+        self.collate_fn = loader.collate_fn
 
     def __iter__(self):
         return self
@@ -64,7 +73,7 @@ class DataIterator(object):
     def __next__(self):
         index = self._next_index()  # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size
         minibatch = [self._dataset[i] for i in index]  # we can abstract it, too to use dynamic batch size
-        minibatch = self._dataset._batch_examples(minibatch)  # list[Example] -> Batch
+        minibatch = self.collate_fn(minibatch)
         return minibatch
 
     def _next_index(self):
diff --git a/parakeet/models/dataloader/__init__.py b/parakeet/models/dataloader/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parakeet/models/dataloader/jlspeech.py b/parakeet/models/dataloader/jlspeech.py
new file mode 100644
index 0000000..ef55b0f
--- /dev/null
+++ b/parakeet/models/dataloader/jlspeech.py
@@ -0,0 +1,148 @@
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import librosa
+
+from paddle import fluid
+from parakeet import g2p
+from parakeet import audio
+from parakeet.data.sampler import *
+from parakeet.data.datacargo import DataCargo
+from parakeet.data.dataset import Dataset
+from parakeet.data.batch import TextIDBatcher, SpecBatcher
+
+class LJSpeechLoader:
+    def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
+        place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
+
+        LJSPEECH_ROOT = Path(config.data_path)
+        dataset = LJSpeech(LJSPEECH_ROOT, config)
+        sampler = DistributedSampler(len(dataset), nranks, rank, shuffle=shuffle)
+
+        assert config.batch_size % nranks == 0
+        each_bs = config.batch_size // nranks
+        if is_vocoder:
+            dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs,
+                                   shuffle=shuffle, collate_fn=batch_examples_vocoder,
+                                   drop_last=True)
+        else:
+            dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs,
+                                   shuffle=shuffle, collate_fn=batch_examples,
+                                   drop_last=True)
+
+        self.reader = fluid.io.DataLoader.from_generator(
+            capacity=32,
+            iterable=True,
+            use_double_buffer=True,
+            return_list=True)
+        self.reader.set_batch_generator(dataloader, place)
+
+
+class LJSpeech(Dataset):
+    def __init__(self, root, config):
+        super(LJSpeech, self).__init__()
+        assert isinstance(root, (str, Path)), "root should be a string or Path object"
+        self.root = root if isinstance(root, Path) else Path(root)
+        self.metadata = self._prepare_metadata()
+        self.config = config
+        self._ljspeech_processor = audio.AudioProcessor(
+            sample_rate=config.audio.sr,
+            num_mels=config.audio.num_mels,
+            min_level_db=config.audio.min_level_db,
+            ref_level_db=config.audio.ref_level_db,
+            n_fft=config.audio.n_fft,
+            win_length=config.audio.win_length,
+            hop_length=config.audio.hop_length,
+            power=config.audio.power,
+            preemphasis=config.audio.preemphasis,
+            signal_norm=True,
+            symmetric_norm=False,
+            max_norm=1.,
+            mel_fmin=0,
+            mel_fmax=None,
+            clip_norm=True,
+            griffin_lim_iters=60,
+            do_trim_silence=False,
+            sound_norm=False)
+
+    def _prepare_metadata(self):
+        csv_path = self.root.joinpath("metadata.csv")
+        metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3,
+                               names=["fname", "raw_text", "normalized_text"])
+        return metadata
+
+    def _get_example(self, metadatum):
+        """All the code for generating an Example from a metadatum. If you want a
+        different preprocessing pipeline, you can override this method. This method
+        may require several processors, each of which has a lot of options. In that
+        case, compose the transforms first and pass the composed transform to the
+        init method instead.
+        """
+        fname, raw_text, normalized_text = metadatum
+        wav_path = self.root.joinpath("wavs", fname + ".wav")
+
+        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
+        wav = self._ljspeech_processor.load_wav(str(wav_path))
+        mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
+        mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
+        phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
+        return (mag, mel, phonemes)  # maybe we need to implement it as a map in the future
+
+    def __getitem__(self, index):
+        metadatum = self.metadata.iloc[index]
+        example = self._get_example(metadatum)
+        return example
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self[i]
+
+    def __len__(self):
+        return len(self.metadata)
+
+
+def batch_examples(batch):
+    texts = []
+    mels = []
+    mel_inputs = []
+    text_lens = []
+    pos_texts = []
+    pos_mels = []
+    for data in batch:
+        _, mel, text = data
+        mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]], axis=-1))
+        text_lens.append(len(text))
+        pos_texts.append(np.arange(1, len(text) + 1))
+        pos_mels.append(np.arange(1, mel.shape[1] + 1))
+        mels.append(mel)
+        texts.append(text)
+
+    # Sort by text_len in descending order
+    texts = [i for i, _ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
+    mels = [i for i, _ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
+    mel_inputs = [i for i, _ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
+    pos_texts = [i for i, _ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
+    pos_mels = [i for i, _ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
+    text_lens = sorted(text_lens, reverse=True)
+
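+    # A small illustration (hypothetical lengths): with text_lens == [2, 5, 3],
+    # each list above is reordered to the [5, 3, 2] permutation, so texts,
+    # mels, and positions stay aligned after sorting.
+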
+    # Pad sequences with the largest length of the batch
+    texts = TextIDBatcher(pad_id=0)(texts)  # (B, T)
+    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)  # (B, T)
+    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)  # (B, T)
+    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  # (B, T, num_mels)
+    mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))  # (B, T, num_mels)
+    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
+
+def batch_examples_vocoder(batch):
+    mels = []
+    mags = []
+    for data in batch:
+        mag, mel, _ = data
+        mels.append(mel)
+        mags.append(mag)
+
+    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
+    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
+
+    return (mels, mags)
diff --git a/parakeet/models/fastspeech/__init__.py b/parakeet/models/fastspeech/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parakeet/models/fastspeech/config/fastapeech.yaml b/parakeet/models/fastspeech/config/fastapeech.yaml
new file mode 100644
index 0000000..3e62846
--- /dev/null
+++ b/parakeet/models/fastspeech/config/fastapeech.yaml
@@ -0,0 +1,41 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
+
+encoder_n_layer: 6
+encoder_head: 2
+encoder_conv1d_filter_size: 1536
+max_sep_len: 2048
+encoder_output_size: 384
+word_vec_dim: 384
+decoder_n_layer: 6
+decoder_head: 2
+decoder_conv1d_filter_size: 1536
+decoder_output_size: 384
+d_model: 384
+duration_predictor_output_size: 256
+duration_predictor_filter_size: 3
+fft_conv1d_filter: 3
+fft_conv1d_padding: 1
+
+
+batch_size: 32
+epochs: 10000
+lr: 0.001
+save_step: 500
+image_step: 2000
+use_gpu: False
+use_data_parallel: False
+
+data_path: ../../../dataset/LJSpeech-1.1
+transtts_path: ./checkpoint
+transformer_step: 70000
+log_dir: ./log
\ No newline at end of file
diff --git a/parakeet/models/fastspeech/config/fastspeech.yaml b/parakeet/models/fastspeech/config/fastspeech.yaml
new file mode 100644
index 0000000..947457b
--- /dev/null
+++ b/parakeet/models/fastspeech/config/fastspeech.yaml
@@ -0,0 +1,43 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
+
+encoder_n_layer: 6
+encoder_head: 2
+encoder_conv1d_filter_size: 1536
+max_sep_len: 2048
+encoder_output_size: 384
+embedding_size: 384
+decoder_n_layer: 6
+decoder_head: 2
+decoder_conv1d_filter_size: 1536
+decoder_output_size: 384
+hidden_size: 384
+duration_predictor_output_size: 256
+duration_predictor_filter_size: 3
+fft_conv1d_filter: 3
+fft_conv1d_padding: 1
+dropout: 0.1
+transformer_head: 4
+
+warm_up_step: 4000
+grad_clip_thresh: 0.1
+batch_size: 32
+epochs: 10000
+lr: 0.001
+save_step: 500
+use_gpu: True
+use_data_parallel: False
+
+data_path: ../../../dataset/LJSpeech-1.1
+transtts_path: ../transformerTTS/checkpoint
+transformer_step: 20
+log_dir: ./log
\ No newline at end of file
diff --git a/parakeet/models/fastspeech/modules.py b/parakeet/models/fastspeech/modules.py
new file mode 100644
index 0000000..621b5c1
--- /dev/null
+++ b/parakeet/models/fastspeech/modules.py
@@ -0,0 +1,150 @@
+import numpy as np
+import math
+import utils
+import paddle.fluid.dygraph as dg
+import paddle.fluid.layers as layers
+import paddle.fluid as fluid
+from parakeet.modules.layers import Conv1D
+from parakeet.modules.multihead_attention import MultiheadAttention
+from parakeet.modules.feed_forward import PositionwiseFeedForward
+
+
+class FFTBlock(dg.Layer):
+    def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
+        super(FFTBlock, self).__init__()
+        self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, dropout=dropout)
+        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size=filter_size, padding=padding, dropout=dropout)
+
+    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
+        """
+        Feed-Forward Transformer block in FastSpeech.
+
+        Args:
+            enc_input (Variable): Shape(B, T, C), dtype: float32. The embedded characters input.
+                T means the timesteps of the input.
+            non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of the sequence.
+            slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
+                len_q means the sequence length of the query, len_k means the sequence length of the key.
+
+        Returns:
+            output (Variable), Shape(B, T, C), the output after self-attention & ffn.
+            slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
+        """
+        output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
+        output *= non_pad_mask
+
+        output = self.pos_ffn(output)
+        output *= non_pad_mask
+
+        return output, slf_attn
+
+
+class LengthRegulator(dg.Layer):
+    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
+        super(LengthRegulator, self).__init__()
+        self.duration_predictor = DurationPredictor(input_size=input_size,
+                                                    out_channels=out_channels,
+                                                    filter_size=filter_size,
+                                                    dropout=dropout)
+
+    def LR(self, x, duration_predictor_output, alpha=1.0):
+        output = []
+        batch_size = x.shape[0]
+        for i in range(batch_size):
+            output.append(self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], alpha))
+        output = self.pad(output)
+        return output
+
+    def pad(self, input_ele):
+        max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
+        out_list = []
+        for i in range(len(input_ele)):
+            pad_len = max_len - input_ele[i].shape[0]
+            one_batch_padded = layers.pad(
+                input_ele[i], [0, pad_len, 0, 0], pad_value=0.0)
+            out_list.append(one_batch_padded)
+        out_padded = layers.stack(out_list)
+        return out_padded
+
+    def expand(self, batch, predicted, alpha):
+        out = []
+        time_steps = batch.shape[1]
+        fertilities = predicted.numpy()
+        batch = layers.squeeze(batch, [0])
+
+        for i in range(time_steps):
+            if fertilities[0, i] == 0:
+                continue
+            out.append(layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1]))
+        out = layers.concat(out, axis=0)
+        return out
+
+    def forward(self, x, alpha=1.0, target=None):
+        """
+        Length Regulator block in FastSpeech.
+
+        Args:
+            x (Variable): Shape(B, T, C), dtype: float32. The encoder output.
+            alpha (float): The hyperparameter that scales the predicted durations and hence
+                the length of the expanded mel sequence, thereby controlling the voice speed.
+            target (Variable, optional): Shape(B, T_text), dtype: int64. The duration of each
+                phoneme, computed from the pretrained transformerTTS alignments.
+
+        Returns:
+            output (Variable), Shape(B, T, C), the output after expansion.
+            duration_predictor_output (Variable), Shape(B, T), the output of the duration predictor.
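+
+        Example (illustrative, with made-up durations): if a batch row has 3
+        phoneme frames and the predicted durations are [2, 1, 3], the i-th
+        frame is repeated duration[i] times, so the expanded output has
+        2 + 1 + 3 = 6 frames.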
+ """ + duration_predictor_output = self.duration_predictor(x) + if fluid.framework._dygraph_tracer()._train_mode: + output = self.LR(x, target) + return output, duration_predictor_output + else: + duration_predictor_output = layers.round(duration_predictor_output) + output = self.LR(x, duration_predictor_output, alpha) + mel_pos = dg.to_variable([i+1 for i in range(output.shape[1])]) + return output, mel_pos + +class DurationPredictor(dg.Layer): + def __init__(self, input_size, out_channels, filter_size, dropout=0.1): + super(DurationPredictor, self).__init__() + self.input_size = input_size + self.out_channels = out_channels + self.filter_size = filter_size + self.dropout = dropout + + self.conv1 = Conv1D(in_channels = self.input_size, + out_channels = self.out_channels, + filter_size = self.filter_size, + padding=1, + data_format='NTC') + self.conv2 = Conv1D(in_channels = self.out_channels, + out_channels = self.out_channels, + filter_size = self.filter_size, + padding=1, + data_format='NTC') + self.layer_norm1 = dg.LayerNorm(self.out_channels) + self.layer_norm2 = dg.LayerNorm(self.out_channels) + + self.linear =dg.Linear(self.out_channels, 1) + + def forward(self, encoder_output): + """ + Duration Predictor block in FastSpeech. + + Args: + encoder_output (Variable): Shape(B, T, C), dtype: float32. The encoder output. + Returns: + out (Variable), Shape(B, T, C), the output of duration predictor. + """ + # encoder_output.shape(N, T, C) + out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout) + out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout) + out = layers.relu(self.linear(out)) + out = layers.squeeze(out, axes=[-1]) + + return out + + diff --git a/parakeet/models/fastspeech/network.py b/parakeet/models/fastspeech/network.py new file mode 100644 index 0000000..2f8dc9a --- /dev/null +++ b/parakeet/models/fastspeech/network.py @@ -0,0 +1,214 @@ +from utils import * +from modules import * +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +from parakeet.g2p.text.symbols import symbols +from parakeet.modules.utils import * +from parakeet.modules.post_convnet import PostConvNet + +class Encoder(dg.Layer): + def __init__(self, + n_src_vocab, + len_max_seq, + d_word_vec, + n_layers, + n_head, + d_k, + d_v, + d_model, + d_inner, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=0.1): + super(Encoder, self).__init__() + n_position = len_max_seq + 1 + + self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0) + self.position_enc = dg.Embedding(size=[n_position, d_word_vec], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + for i, layer in enumerate(self.layer_stack): + self.add_sublayer('fft_{}'.format(i), layer) + + def forward(self, character, text_pos): + """ + Encoder layer of FastSpeech. + + Args: + character (Variable): Shape(B, T_text), dtype: float32. The input text + characters. T_text means the timesteps of input characters. + text_pos (Variable): Shape(B, T_text), dtype: int64. The input text + position. T_text means the timesteps of input characters. + + Returns: + enc_output (Variable), Shape(B, text_T, C), the encoder output. 
+            non_pad_mask (Variable), Shape(B, T_text, 1), the non-pad mask.
+            enc_slf_attn_list (list), Len(n_layers), each with Shape(B * n_head, T_text, T_text),
+                the encoder self attention list.
+        """
+        enc_slf_attn_list = []
+        # -- prepare masks
+        # shape of character: (N, T)
+        slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
+        non_pad_mask = get_non_pad_mask(character)
+
+        # -- Forward
+        enc_output = self.src_word_emb(character) + self.position_enc(text_pos)  # (N, T, C)
+
+        for enc_layer in self.layer_stack:
+            enc_output, enc_slf_attn = enc_layer(
+                enc_output,
+                non_pad_mask=non_pad_mask,
+                slf_attn_mask=slf_attn_mask)
+            enc_slf_attn_list += [enc_slf_attn]
+
+        return enc_output, non_pad_mask, enc_slf_attn_list
+
+class Decoder(dg.Layer):
+    def __init__(self,
+                 len_max_seq,
+                 d_word_vec,
+                 n_layers,
+                 n_head,
+                 d_k,
+                 d_v,
+                 d_model,
+                 d_inner,
+                 fft_conv1d_kernel,
+                 fft_conv1d_padding,
+                 dropout=0.1):
+        super(Decoder, self).__init__()
+
+        n_position = len_max_seq + 1
+        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
+        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
+                                         padding_idx=0,
+                                         param_attr=fluid.ParamAttr(
+                                             initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
+                                             trainable=False))
+        self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout)
+                            for _ in range(n_layers)]
+        for i, layer in enumerate(self.layer_stack):
+            self.add_sublayer('fft_{}'.format(i), layer)
+
+    def forward(self, enc_seq, enc_pos):
+        """
+        Decoder layer of FastSpeech.
+
+        Args:
+            enc_seq (Variable): Shape(B, T_mel, C), dtype: float32.
+                The output of the length regulator.
+            enc_pos (Variable): Shape(B, T_mel), dtype: int64.
+                The spectrum positions. T_mel means the timesteps of the input spectrum.
+
+        Returns:
+            dec_output (Variable), Shape(B, T_mel, C), the decoder output.
+            dec_slf_attn_list (list), each with Shape(B * n_head, T_mel, T_mel),
+                the decoder self attention list.
+ """ + dec_slf_attn_list = [] + + # -- Prepare masks + slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) + non_pad_mask = get_non_pad_mask(enc_pos) + + # -- Forward + dec_output = enc_seq + self.position_enc(enc_pos) + + for dec_layer in self.layer_stack: + dec_output, dec_slf_attn = dec_layer( + dec_output, + non_pad_mask=non_pad_mask, + slf_attn_mask=slf_attn_mask) + dec_slf_attn_list += [dec_slf_attn] + + return dec_output, dec_slf_attn_list + +class FastSpeech(dg.Layer): + def __init__(self, cfg): + " FastSpeech" + super(FastSpeech, self).__init__() + + self.encoder = Encoder(n_src_vocab=len(symbols)+1, + len_max_seq=cfg.max_sep_len, + d_word_vec=cfg.embedding_size, + n_layers=cfg.encoder_n_layer, + n_head=cfg.encoder_head, + d_k=64, + d_v=64, + d_model=cfg.hidden_size, + d_inner=cfg.encoder_conv1d_filter_size, + fft_conv1d_kernel=cfg.fft_conv1d_filter, + fft_conv1d_padding=cfg.fft_conv1d_padding, + dropout=0.1) + self.length_regulator = LengthRegulator(input_size=cfg.hidden_size, + out_channels=cfg.duration_predictor_output_size, + filter_size=cfg.duration_predictor_filter_size, + dropout=cfg.dropout) + self.decoder = Decoder(len_max_seq=cfg.max_sep_len, + d_word_vec=cfg.embedding_size, + n_layers=cfg.decoder_n_layer, + n_head=cfg.decoder_head, + d_k=64, + d_v=64, + d_model=cfg.hidden_size, + d_inner=cfg.decoder_conv1d_filter_size, + fft_conv1d_kernel=cfg.fft_conv1d_filter, + fft_conv1d_padding=cfg.fft_conv1d_padding, + dropout=0.1) + self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels) + self.postnet = PostConvNet(n_mels=80, + num_hidden=512, + filter_size=5, + padding=int(5 / 2), + num_conv=5, + outputs_per_step=1, + use_cudnn=True, + dropout=0.1) + + def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): + """ + FastSpeech model. + + Args: + character (Variable): Shape(B, T_text), dtype: float32. The input text + characters. T_text means the timesteps of input characters. + text_pos (Variable): Shape(B, T_text), dtype: int64. The input text + position. T_text means the timesteps of input characters. + mel_pos (Variable, optional): Shape(B, T_mel), + dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum. + length_target (Variable, optional): Shape(B, T_text), + dtype: int64. The duration of phoneme compute from pretrained transformerTTS. + alpha (Constant): + dtype: float32. The hyperparameter to determine the length of the expanded sequence + mel, thereby controlling the voice speed. + + Returns: + mel_output (Variable), Shape(B, mel_T, C), the mel output before postnet. + mel_output_postnet (Variable), Shape(B, mel_T, C), the mel output after postnet. + duration_predictor_output (Variable), Shape(B, text_T), the duration of phoneme compute + with duration predictor. + enc_slf_attn_list (Variable), Shape(B, text_T, text_T), the encoder self attention list. + dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. 
+ """ + + encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) + if fluid.framework._dygraph_tracer()._train_mode: + + length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, + target=length_target, + alpha=alpha) + decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) + + mel_output = self.mel_linear(decoder_output) + mel_output_postnet = self.postnet(mel_output) + mel_output + + return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list + else: + length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) + decoder_output = self.decoder(length_regulator_output, decoder_pos) + + mel_output = self.mel_linear(decoder_output) + mel_output_postnet = self.postnet(mel_output) + mel_output + + return mel_output, mel_output_postnet \ No newline at end of file diff --git a/parakeet/models/fastspeech/parse.py b/parakeet/models/fastspeech/parse.py new file mode 100644 index 0000000..4132674 --- /dev/null +++ b/parakeet/models/fastspeech/parse.py @@ -0,0 +1,93 @@ +import jsonargparse + +def add_config_options_to_parser(parser): + parser.add_argument('--audio.num_mels', type=int, default=80, + help="the number of mel bands when calculating mel spectrograms.") + parser.add_argument('--audio.n_fft', type=int, default=2048, + help="the number of fft components.") + parser.add_argument('--audio.sr', type=int, default=22050, + help="the sampling rate of audio data file.") + parser.add_argument('--audio.preemphasis', type=float, default=0.97, + help="the preemphasis coefficient.") + parser.add_argument('--audio.hop_length', type=int, default=128, + help="the number of samples to advance between frames.") + parser.add_argument('--audio.win_length', type=int, default=1024, + help="the length (width) of the window function.") + parser.add_argument('--audio.power', type=float, default=1.4, + help="the power to raise before griffin-lim.") + parser.add_argument('--audio.min_level_db', type=int, default=-100, + help="the minimum level db.") + parser.add_argument('--audio.ref_level_db', type=int, default=20, + help="the reference level db.") + parser.add_argument('--audio.outputs_per_step', type=int, default=1, + help="the outputs per step.") + + parser.add_argument('--embedding_size', type=int, default=256, + help="the dim size of embedding.") + parser.add_argument('--encoder_n_layer', type=int, default=6, + help="the number of FFT Block in encoder.") + parser.add_argument('--encoder_head', type=int, default=2, + help="the attention head number in encoder.") + parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024, + help="the filter size of conv1d in encoder.") + parser.add_argument('--max_sep_len', type=int, default=2048, + help="the max length of sequence.") + parser.add_argument('--encoder_output_size', type=int, default=256, + help="the output channel size of encoder.") + parser.add_argument('--decoder_n_layer', type=int, default=6, + help="the number of FFT Block in decoder.") + parser.add_argument('--decoder_head', type=int, default=2, + help="the attention head number in decoder.") + parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024, + help="the filter size of conv1d in decoder.") + parser.add_argument('--decoder_output_size', type=int, default=256, + help="the output channel size of decoder.") + parser.add_argument('--hidden_size', type=int, default=256, + help="the hidden size in model.") + 
+    parser.add_argument('--duration_predictor_output_size', type=int, default=256,
+                        help="the output size of the duration predictor.")
+    parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
+                        help="the filter size of conv1d in the duration predictor.")
+    parser.add_argument('--fft_conv1d_filter', type=int, default=3,
+                        help="the filter size of conv1d in the FFT block.")
+    parser.add_argument('--fft_conv1d_padding', type=int, default=1,
+                        help="the padding size of conv1d in the FFT block.")
+    parser.add_argument('--dropout', type=float, default=0.1,
+                        help="the dropout rate in the network.")
+    parser.add_argument('--transformer_head', type=int, default=4,
+                        help="the number of attention heads of transformerTTS.")
+
+    parser.add_argument('--warm_up_step', type=int, default=4000,
+                        help="the warm up step of the learning rate.")
+    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
+                        help="the threshold of gradient clipping.")
+    parser.add_argument('--batch_size', type=int, default=32,
+                        help="batch size for training.")
+    parser.add_argument('--epochs', type=int, default=10000,
+                        help="the number of epochs for training.")
+    parser.add_argument('--lr', type=float, default=0.001,
+                        help="the learning rate for training.")
+    parser.add_argument('--save_step', type=int, default=500,
+                        help="checkpointing interval during training.")
+    parser.add_argument('--use_gpu', type=bool, default=True,
+                        help="use gpu or not during training.")
+    parser.add_argument('--use_data_parallel', type=bool, default=False,
+                        help="use data parallel or not during training.")
+
+    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
+                        help="the path of the dataset.")
+    parser.add_argument('--checkpoint_path', type=str, default=None,
+                        help="the path to load a checkpoint or pretrained model from.")
+    parser.add_argument('--save_path', type=str, default='./checkpoint',
+                        help="the path to save checkpoints.")
+    parser.add_argument('--log_dir', type=str, default='./log',
+                        help="the directory to save tensorboard logs.")
+    parser.add_argument('--sample_path', type=str, default='./sample',
+                        help="the directory to save audio samples during synthesis.")
+    parser.add_argument('--transtts_path', type=str, default='./log',
+                        help="the directory to load the pretrained transformerTTS model from.")
+    parser.add_argument('--transformer_step', type=int, default=70000,
+                        help="the step of the transformerTTS checkpoint to load.")
+
+    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
diff --git a/parakeet/models/fastspeech/train.py b/parakeet/models/fastspeech/train.py
new file mode 100644
index 0000000..243631c
--- /dev/null
+++ b/parakeet/models/fastspeech/train.py
@@ -0,0 +1,139 @@
+import numpy as np
+import argparse
+import os
+import time
+import math
+import jsonargparse
+from pathlib import Path
+from tqdm import tqdm
+from tensorboardX import SummaryWriter
+import paddle.fluid.dygraph as dg
+import paddle.fluid.layers as layers
+import paddle.fluid as fluid
+from parse import add_config_options_to_parser
+from pprint import pprint
+from network import FastSpeech
+from utils import get_alignment
+from parakeet.models.dataloader.jlspeech import LJSpeechLoader
+from parakeet.models.transformerTTS.network import TransformerTTS
+
+class MyDataParallel(dg.parallel.DataParallel):
+    """
+    A data parallel proxy for the model.
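+
+    Attribute lookups that miss on the wrapper fall through to the wrapped
+    layers, so e.g. `model.state_dict()` behaves the same whether or not the
+    model is wrapped (illustrative usage, not specific to this file).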
+ """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key is "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + +def main(cfg): + + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 + + if local_rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(cfg)) + + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'fastspeech') + + writer = SummaryWriter(path) if local_rank == 0 else None + + with dg.guard(place): + transformerTTS = TransformerTTS(cfg) + model_path = os.path.join(cfg.transtts_path, "transformer") + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step))) + #for param in transformerTTS.state_dict(): + # print(param) + + transformerTTS.set_dict(model_dict) + transformerTTS.eval() + + model = FastSpeech(cfg) + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), + parameter_list=model.parameters()) + reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("load checkpoint!!!") + + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + model = MyDataParallel(model, strategy) + + for epoch in range(cfg.epochs): + pbar = tqdm(reader) + + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + character, mel, mel_input, pos_text, pos_mel, text_length = data + + _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) + alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32) + + global_step += 1 + + #Forward + result= model(character, + pos_text, + mel_pos=pos_mel, + length_target=alignment) + mel_output, mel_output_postnet, duration_predictor_output, _, _ = result + mel_loss = layers.mse_loss(mel_output, mel) + mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) + duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment))) + total_loss = mel_loss + mel_postnet_loss + duration_loss + + if local_rank==0: + print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy())) + + writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) + writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) + writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) + writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + + + if cfg.use_data_parallel: + total_loss = model.scale_loss(total_loss) + total_loss.backward() + model.apply_collective_grads() + else: + total_loss.backward() + optimizer.minimize(total_loss, grad_clip = 
+                model.clear_gradients()
+
+                # save checkpoint
+                if local_rank == 0 and global_step % cfg.save_step == 0:
+                    if not os.path.exists(cfg.save_path):
+                        os.mkdir(cfg.save_path)
+                    save_path = os.path.join(cfg.save_path, 'fastspeech/%d' % global_step)
+                    dg.save_dygraph(model.state_dict(), save_path)
+                    dg.save_dygraph(optimizer.state_dict(), save_path)
+        if local_rank == 0:
+            writer.close()
+
+
+if __name__ == '__main__':
+    parser = jsonargparse.ArgumentParser(description="Train FastSpeech model", formatter_class='default_argparse')
+    add_config_options_to_parser(parser)
+    cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
+    main(cfg)
\ No newline at end of file
diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py
new file mode 100644
index 0000000..7517a13
--- /dev/null
+++ b/parakeet/models/fastspeech/utils.py
@@ -0,0 +1,32 @@
+import numpy as np
+
+def get_alignment(attn_probs, n_head):
+    max_F = 0
+    assert attn_probs[0].shape[0] % n_head == 0
+    batch_size = int(attn_probs[0].shape[0] // n_head)
+    for i in range(len(attn_probs)):
+        multi_attn = attn_probs[i].numpy()
+        for j in range(n_head):
+            attn = multi_attn[j * batch_size:(j + 1) * batch_size]
+            F = score_F(attn)
+            if max_F < F:
+                max_F = F
+                max_attn = attn
+    alignment = compute_duration(max_attn)
+    return alignment
+
+def score_F(attn):
+    max_ = np.max(attn, axis=-1)
+    mean = np.mean(max_)
+    return mean
+
+def compute_duration(attn):
+    alignment = np.zeros([attn.shape[0], attn.shape[2]])
+    for i in range(attn.shape[0]):
+        for j in range(attn.shape[1]):
+            max_index = attn[i, j].tolist().index(attn[i, j].max())
+            alignment[i, max_index] += 1
+
+    return alignment
+
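+# Illustrative only: for a (B, T_mel, T_text) attention tensor, compute_duration
+# counts, per batch row, how many mel frames attend most strongly to each text
+# position, e.g. a 4-frame toy attention peaking at text positions
+# [0, 0, 1, 2] yields durations [2, 1, 1].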
diff --git a/parakeet/models/transformerTTS/config/synthesis.yaml b/parakeet/models/transformerTTS/config/synthesis.yaml
new file mode 100644
index 0000000..c3c3f8c
--- /dev/null
+++ b/parakeet/models/transformerTTS/config/synthesis.yaml
@@ -0,0 +1,20 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
+
+max_len: 50
+transformer_step: 1
+postnet_step: 1
+use_gpu: True
+
+checkpoint_path: ./checkpoint
+log_dir: ./log
+sample_path: ./sample
\ No newline at end of file
diff --git a/parakeet/models/transformerTTS/config/train_postnet.yaml b/parakeet/models/transformerTTS/config/train_postnet.yaml
new file mode 100644
index 0000000..74e1b5a
--- /dev/null
+++ b/parakeet/models/transformerTTS/config/train_postnet.yaml
@@ -0,0 +1,27 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
+
+hidden_size: 256
+embedding_size: 512
+
+warm_up_step: 4000
+grad_clip_thresh: 1.0
+batch_size: 32
+epochs: 10000
+lr: 0.001
+save_step: 500
+use_gpu: True
+use_data_parallel: True
+
+data_path: ../../../dataset/LJSpeech-1.1
+save_path: ./checkpoint
+log_dir: ./log
\ No newline at end of file
diff --git a/parakeet/models/transformerTTS/config/train_transformer.yaml b/parakeet/models/transformerTTS/config/train_transformer.yaml
new file mode 100644
index 0000000..0fbde62
--- /dev/null
+++ b/parakeet/models/transformerTTS/config/train_transformer.yaml
@@ -0,0 +1,33 @@
+audio:
+  num_mels: 80
+  n_fft: 2048
+  sr: 22050
+  preemphasis: 0.97
+  hop_length: 275
+  win_length: 1102
+  power: 1.2
+  min_level_db: -100
+  ref_level_db: 20
+  outputs_per_step: 1
+
+
+hidden_size: 256
+embedding_size: 512
+
+
+warm_up_step: 4000
+grad_clip_thresh: 1.0
+batch_size: 32
+epochs: 10000
+lr: 0.001
+save_step: 1000
+image_step: 2000
+use_gpu: True
+use_data_parallel: False
+
+data_path: ../../../dataset/LJSpeech-1.1
+save_path: ./checkpoint
+log_dir: ./log
+#checkpoint_path: ./checkpoint/transformer/1
+
\ No newline at end of file
diff --git a/parakeet/models/transformerTTS/layers.py b/parakeet/models/transformerTTS/layers.py
new file mode 100644
index 0000000..7a8e97e
--- /dev/null
+++ b/parakeet/models/transformerTTS/layers.py
@@ -0,0 +1,166 @@
+import math
+import numpy as np
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+
+class Conv1D(dg.Layer):
+    """
+    A convolution 1D block implemented with Conv2D. For simplicity, and to
+    ensure the output has the same length as the input, it does not allow
+    stride > 1.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 num_filters,
+                 filter_size=3,
+                 padding=0,
+                 dilation=1,
+                 stride=1,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 data_format='NCT',
+                 dtype="float32"):
+        super(Conv1D, self).__init__(dtype=dtype)
+
+        self.padding = padding
+        self.in_channels = in_channels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.stride = stride
+        self.dilation = dilation
+        self.act = act
+        self.data_format = data_format
+
+        self.conv = dg.Conv2D(
+            in_channels=in_channels,
+            num_filters=num_filters,
+            filter_size=(1, filter_size),
+            stride=(1, stride),
+            dilation=(1, dilation),
+            padding=(0, padding),
+            groups=groups,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            use_cudnn=use_cudnn,
+            act=act,
+            dtype=dtype)
+
+    def forward(self, x):
+        """
+        Args:
+            x (Variable): Shape(B, C_in, T), the input, where C_in means
+                input channels.
+        Returns:
+            x (Variable): Shape(B, C_out, T), the outputs, where C_out means
+                output channels (num_filters).
+        """
+        if self.data_format == 'NTC':
+            x = fluid.layers.transpose(x, [0, 2, 1])
+        x = fluid.layers.unsqueeze(x, [2])
+        x = self.conv(x)
+        x = fluid.layers.squeeze(x, [2])
+        if self.data_format == 'NTC':
+            x = fluid.layers.transpose(x, [0, 2, 1])
+        return x
+
+class Pool1D(dg.Layer):
+    """
+    A Pool 1D block implemented with Pool2D.
+    """
+    def __init__(self,
+                 pool_size=-1,
+                 pool_type='max',
+                 pool_stride=1,
+                 pool_padding=0,
+                 global_pooling=False,
+                 use_cudnn=True,
+                 ceil_mode=False,
+                 exclusive=True,
+                 data_format='NCT',
+                 dtype='float32'):
+        super(Pool1D, self).__init__(dtype=dtype)
+        self.pool_size = pool_size
+        self.pool_type = pool_type
+        self.pool_stride = pool_stride
+        self.pool_padding = pool_padding
+        self.global_pooling = global_pooling
+        self.use_cudnn = use_cudnn
+        self.ceil_mode = ceil_mode
+        self.exclusive = exclusive
+        self.data_format = data_format
+        self.dtype = dtype
+
+        self.pool2d = dg.Pool2D([1, pool_size], pool_type=pool_type,
+                                pool_stride=[1, pool_stride], pool_padding=[0, pool_padding],
+                                global_pooling=global_pooling, use_cudnn=use_cudnn,
+                                ceil_mode=ceil_mode, exclusive=exclusive, dtype=dtype)
+
+    def forward(self, x):
+        """
+        Args:
+            x (Variable): Shape(B, C, T), the input, where C means the
+                input channels.
+        Returns:
+            x (Variable): Shape(B, C, T), the pooled outputs; pooling is
+                applied along T, so the channel count is unchanged.
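+
+        For example (illustrative): with pool_size=2, pool_stride=1 and
+        pool_padding=1, a (B, C, T) input comes back as (B, C, T + 1);
+        e.g. CBHG in module.py slices off the extra frame after pooling.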
+ """ + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + x = fluid.layers.unsqueeze(x, [2]) + x = self.pool2d(x) + x = fluid.layers.squeeze(x, [2]) + if self.data_format == 'NTC': + x = fluid.layers.transpose(x, [0, 2, 1]) + return x + +class DynamicGRU(dg.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = dg.GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res + diff --git a/parakeet/models/transformerTTS/module.py b/parakeet/models/transformerTTS/module.py new file mode 100644 index 0000000..ecacb1b --- /dev/null +++ b/parakeet/models/transformerTTS/module.py @@ -0,0 +1,242 @@ +import math +from parakeet.g2p.text.symbols import symbols +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from parakeet.modules.layers import Conv, Pool1D +from parakeet.modules.dynamicGRU import DynamicGRU +import numpy as np + + + +class EncoderPrenet(dg.Layer): + def __init__(self, embedding_size, num_hidden, use_cudnn=True): + super(EncoderPrenet, self).__init__() + self.embedding_size = embedding_size + self.num_hidden = num_hidden + self.use_cudnn = use_cudnn + self.embedding = dg.Embedding( size = [len(symbols), embedding_size], + param_attr = fluid.ParamAttr(name='weight'), + padding_idx = None) + self.conv_list = [] + self.conv_list.append(Conv(in_channels = embedding_size, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT")) + for _ in range(2): + self.conv_list.append(Conv(in_channels = num_hidden, + out_channels = num_hidden, + filter_size = 5, + padding = int(np.floor(5/2)), + use_cudnn = use_cudnn, + data_format = "NCT")) + + for i, layer in enumerate(self.conv_list): + self.add_sublayer("conv_list_{}".format(i), layer) + + self.batch_norm_list = [dg.BatchNorm(num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') for _ in range(3)] + + for i, layer in enumerate(self.batch_norm_list): + self.add_sublayer("batch_norm_list_{}".format(i), layer) + + self.projection = dg.Linear(num_hidden, num_hidden) + + def forward(self, x): + x = self.embedding(x) #(batch_size, seq_len, embending_size) + x = layers.transpose(x,[0,2,1]) + for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): + x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) + x = layers.transpose(x,[0,2,1]) #(N,T,C) + x = self.projection(x) + return x + +class CBHG(dg.Layer): + def __init__(self, hidden_size, batch_size, K=16, 
+class CBHG(dg.Layer):
+    def __init__(self, hidden_size, batch_size, K=16, projection_size=256, num_gru_layers=2,
+                 max_pool_kernel_size=2, is_post=False):
+        """
+        :param hidden_size: dimension of hidden unit
+        :param K: # of convolution banks
+        :param projection_size: dimension of projection unit
+        :param num_gru_layers: # of layers of GRUcell
+        :param max_pool_kernel_size: max pooling kernel size
+        :param is_post: whether post processing or not
+        """
+        super(CBHG, self).__init__()
+        self.hidden_size = hidden_size
+        self.projection_size = projection_size
+        self.conv_list = []
+        self.conv_list.append(Conv(in_channels=projection_size,
+                                   out_channels=hidden_size,
+                                   filter_size=1,
+                                   padding=int(np.floor(1 / 2)),
+                                   data_format="NCT"))
+        for i in range(2, K + 1):
+            self.conv_list.append(Conv(in_channels=hidden_size,
+                                       out_channels=hidden_size,
+                                       filter_size=i,
+                                       padding=int(np.floor(i / 2)),
+                                       data_format="NCT"))
+
+        for i, layer in enumerate(self.conv_list):
+            self.add_sublayer("conv_list_{}".format(i), layer)
+
+        self.batchnorm_list = []
+        for i in range(K):
+            self.batchnorm_list.append(dg.BatchNorm(hidden_size,
+                                                    param_attr=fluid.ParamAttr(name='weight'),
+                                                    bias_attr=fluid.ParamAttr(name='bias'),
+                                                    moving_mean_name='moving_mean',
+                                                    moving_variance_name='moving_var',
+                                                    data_layout='NCHW'))
+
+        for i, layer in enumerate(self.batchnorm_list):
+            self.add_sublayer("batchnorm_list_{}".format(i), layer)
+
+        conv_outdim = hidden_size * K
+
+        self.conv_projection_1 = Conv(in_channels=conv_outdim,
+                                      out_channels=hidden_size,
+                                      filter_size=3,
+                                      padding=int(np.floor(3 / 2)),
+                                      data_format="NCT")
+
+        self.conv_projection_2 = Conv(in_channels=hidden_size,
+                                      out_channels=projection_size,
+                                      filter_size=3,
+                                      padding=int(np.floor(3 / 2)),
+                                      data_format="NCT")
+
+        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
+                                             param_attr=fluid.ParamAttr(name='weight'),
+                                             bias_attr=fluid.ParamAttr(name='bias'),
+                                             moving_mean_name='moving_mean',
+                                             moving_variance_name='moving_var',
+                                             data_layout='NCHW')
+        self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
+                                             param_attr=fluid.ParamAttr(name='weight'),
+                                             bias_attr=fluid.ParamAttr(name='bias'),
+                                             moving_mean_name='moving_mean',
+                                             moving_variance_name='moving_var',
+                                             data_layout='NCHW')
+        self.max_pool = Pool1D(pool_size=max_pool_kernel_size,
+                               pool_type='max',
+                               pool_stride=1,
+                               pool_padding=1,
+                               data_format="NCT")
+        self.highway = Highwaynet(self.projection_size)
+
+        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
+        h_0 = dg.to_variable(h_0)
+        self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.gru_forward1 = DynamicGRU(size=self.hidden_size // 2,
+                                       param_attr=fluid.ParamAttr(name='weight'),
+                                       bias_attr=fluid.ParamAttr(name='bias'),
+                                       is_reverse=False,
+                                       origin_mode=True,
+                                       h_0=h_0)
+        self.gru_reverse1 = DynamicGRU(size=self.hidden_size // 2,
+                                       param_attr=fluid.ParamAttr(name='weight'),
+                                       bias_attr=fluid.ParamAttr(name='bias'),
+                                       is_reverse=True,
+                                       origin_mode=True,
+                                       h_0=h_0)
+
+        self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.gru_forward2 = DynamicGRU(size=self.hidden_size // 2,
+                                       param_attr=fluid.ParamAttr(name='weight'),
+                                       bias_attr=fluid.ParamAttr(name='bias'),
+                                       is_reverse=False,
+                                       origin_mode=True,
+                                       h_0=h_0)
+        self.gru_reverse2 = DynamicGRU(size=self.hidden_size // 2,
+                                       param_attr=fluid.ParamAttr(name='weight'),
+                                       bias_attr=fluid.ParamAttr(name='bias'),
+                                       is_reverse=True,
+                                       origin_mode=True,
+                                       h_0=h_0)
+    def _conv_fit_dim(self, x, filter_size=3):
+        if filter_size % 2 == 0:
+            return x[:, :, :-1]
+        else:
+            return x
+
+    def forward(self, input_):
+        # input_.shape = [N, C, T]
+        conv_list = []
+        conv_input = input_
+
+        for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
+            conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
+            conv_input = layers.relu(batchnorm(conv_input))
+            conv_list.append(conv_input)
+
+        conv_cat = layers.concat(conv_list, axis=1)
+        conv_pool = self.max_pool(conv_cat)[:, :, :-1]
+
+        conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
+        conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
+
+        # conv_proj.shape = [N, C, T]
+        highway = layers.transpose(conv_proj, [0, 2, 1])
+        highway = self.highway(highway)
+
+        # highway.shape = [N, T, C]
+        fc_forward = self.fc_forward1(highway)
+        fc_reverse = self.fc_reverse1(highway)
+        out_forward = self.gru_forward1(fc_forward)
+        out_reverse = self.gru_reverse1(fc_reverse)
+        out = layers.concat([out_forward, out_reverse], axis=-1)
+        fc_forward = self.fc_forward2(out)
+        fc_reverse = self.fc_reverse2(out)
+        out_forward = self.gru_forward2(fc_forward)
+        out_reverse = self.gru_reverse2(fc_reverse)
+        out = layers.concat([out_forward, out_reverse], axis=-1)
+        out = layers.transpose(out, [0, 2, 1])
+        return out
+
+class Highwaynet(dg.Layer):
+    def __init__(self, num_units, num_layers=4):
+        super(Highwaynet, self).__init__()
+        self.num_units = num_units
+        self.num_layers = num_layers
+
+        self.gates = []
+        self.linears = []
+
+        for i in range(num_layers):
+            self.linears.append(dg.Linear(num_units, num_units))
+            self.gates.append(dg.Linear(num_units, num_units))
+
+        for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
+            self.add_sublayer("linears_{}".format(i), linear)
+            self.add_sublayer("gates_{}".format(i), gate)
+
+    def forward(self, input_):
+        out = input_
+
+        for linear, gate in zip(self.linears, self.gates):
+            h = fluid.layers.relu(linear(out))
+            t_ = fluid.layers.sigmoid(gate(out))
+
+            c = 1 - t_
+            out = h * t_ + out * c
+
+        return out
+
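+# Each highway layer above computes out = H(x) * T(x) + x * (1 - T(x)),
+# where H is a relu-activated linear transform and T a sigmoid gate; with
+# T(x) = 0 the layer is an identity, with T(x) = 1 it is a plain layer.
+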
num_hidden = num_hidden, + use_cudnn=config.use_gpu) + self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] + for i, layer in enumerate(self.layers): + self.add_sublayer("self_attn_{}".format(i), layer) + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)] + for i, layer in enumerate(self.ffns): + self.add_sublayer("ffns_{}".format(i), layer) + + def forward(self, x, positional): + if fluid.framework._dygraph_tracer()._train_mode: + query_mask = get_non_pad_mask(positional) + mask = get_attn_key_pad_mask(positional, x) + else: + query_mask, mask = None, None + + + # Encoder pre_network + x = self.encoder_prenet(x) #(N,T,C) + + + # Get positional encoding + positional = self.pos_emb(positional) + x = positional * self.alpha + x #(N, T, C) + + + # Positional dropout + x = layers.dropout(x, 0.1) + + # Self attention encoder + attentions = list() + for layer, ffn in zip(self.layers, self.ffns): + x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) + x = ffn(x) + attentions.append(attention) + + return x, query_mask, attentions + +class Decoder(dg.Layer): + def __init__(self, num_hidden, config): + super(Decoder, self).__init__() + self.num_hidden = num_hidden + param = fluid.ParamAttr(name='alpha') + self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', + default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) + self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding(size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + name='weight', + initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), + trainable=False)) + self.decoder_prenet = PreNet(input_size = config.audio.num_mels, + hidden_size = num_hidden * 2, + output_size = num_hidden, + dropout_rate=0.2) + self.linear = dg.Linear(num_hidden, num_hidden) + + self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] + for i, layer in enumerate(self.selfattn_layers): + self.add_sublayer("self_attn_{}".format(i), layer) + self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] + for i, layer in enumerate(self.attn_layers): + self.add_sublayer("attn_{}".format(i), layer) + self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] + for i, layer in enumerate(self.ffns): + self.add_sublayer("ffns_{}".format(i), layer) + self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) + self.stop_linear = dg.Linear(num_hidden, 1) + + self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, + filter_size = 5, padding = 4, num_conv=5, + outputs_per_step=config.audio.outputs_per_step, + use_cudnn = config.use_gpu) + + def forward(self, key, value, query, c_mask, positional): + + # get decoder mask with triangular matrix + + if fluid.framework._dygraph_tracer()._train_mode: + m_mask = get_non_pad_mask(positional) + mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query) + triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) + mask = mask + triu_tensor + mask = fluid.layers.cast(mask == 0, np.float32) + + # (batch_size, decoder_len, encoder_len) + zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) + else: + mask = get_triu_tensor(query.numpy(), 
query.numpy()).astype(np.float32) + mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) + m_mask, zero_mask = None, None + + # Decoder pre-network + query = self.decoder_prenet(query) + + # Centered position + query = self.linear(query) + + # Get position embedding + positional = self.pos_emb(positional) + query = positional * self.alpha + query + + #positional dropout + query = fluid.layers.dropout(query, 0.1) + + # Attention decoder-decoder, encoder-decoder + selfattn_list = list() + attn_list = list() + + for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): + query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) + query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) + query = ffn(query) + selfattn_list.append(attn_dec) + attn_list.append(attn_dot) + + # Mel linear projection + mel_out = self.mel_linear(query) + # Post Mel Network + out = self.postconvnet(mel_out) + out = mel_out + out + + # Stop tokens + stop_tokens = self.stop_linear(query) + stop_tokens = layers.squeeze(stop_tokens, [-1]) + stop_tokens = layers.sigmoid(stop_tokens) + + return mel_out, out, attn_list, stop_tokens, selfattn_list + +class TransformerTTS(dg.Layer): + def __init__(self, config): + super(TransformerTTS, self).__init__() + self.encoder = Encoder(config.embedding_size, config.hidden_size, config) + self.decoder = Decoder(config.hidden_size, config) + self.config = config + + def forward(self, characters, mel_input, pos_text, pos_mel): + # key (batch_size, seq_len, channel) + # c_mask (batch_size, seq_len) + # attns_enc (channel / 2, seq_len, seq_len) + + key, c_mask, attns_enc = self.encoder(characters, pos_text) + + # mel_output/postnet_output (batch_size, mel_len, n_mel) + # attn_probs (128, mel_len, seq_len) + # stop_preds (batch_size, mel_len, 1) + # attns_dec (128, mel_len, mel_len) + mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel) + + return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec + +class ModelPostNet(dg.Layer): + """ + CBHG Network (mel -> linear) + """ + def __init__(self, config): + super(ModelPostNet, self).__init__() + self.pre_proj = Conv1D(in_channels = config.audio.num_mels, + out_channels = config.hidden_size, + filter_size=1, + data_format = "NCT") + self.cbhg = CBHG(config.hidden_size, config.batch_size) + self.post_proj = Conv1D(in_channels = config.hidden_size, + out_channels = (config.audio.n_fft // 2) + 1, + filter_size=1, + data_format = "NCT") + + def forward(self, mel): + mel = layers.transpose(mel, [0,2,1]) + mel = self.pre_proj(mel) + mel = self.cbhg(mel) + mag_pred = self.post_proj(mel) + mag_pred = layers.transpose(mag_pred, [0,2,1]) + return mag_pred + + + + + + diff --git a/parakeet/models/transformerTTS/parse.py b/parakeet/models/transformerTTS/parse.py new file mode 100644 index 0000000..584ea63 --- /dev/null +++ b/parakeet/models/transformerTTS/parse.py @@ -0,0 +1,67 @@ +import jsonargparse + +def add_config_options_to_parser(parser): + parser.add_argument('--audio.num_mels', type=int, default=80, + help="the number of mel bands when calculating mel spectrograms.") + parser.add_argument('--audio.n_fft', type=int, default=2048, + help="the number of fft components.") + parser.add_argument('--audio.sr', type=int, default=22050, + help="the sampling rate of audio data file.") + parser.add_argument('--audio.preemphasis', type=float, default=0.97, + help="the preemphasis coefficient.") 
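+ # With the defaults below at sr=22050, hop_length=128 gives a ~5.8 ms frame
+ # shift and win_length=1024 a ~46 ms analysis window (hop = win_length / 8).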
+ parser.add_argument('--audio.hop_length', type=int, default=128, + help="the number of samples to advance between frames.") + parser.add_argument('--audio.win_length', type=int, default=1024, + help="the length (width) of the window function.") + parser.add_argument('--audio.power', type=float, default=1.4, + help="the power to raise before griffin-lim.") + parser.add_argument('--audio.min_level_db', type=int, default=-100, + help="the minimum level db.") + parser.add_argument('--audio.ref_level_db', type=int, default=20, + help="the reference level db.") + parser.add_argument('--audio.outputs_per_step', type=int, default=1, + help="the number of output frames per decoder step.") + + parser.add_argument('--hidden_size', type=int, default=256, + help="the hidden size in the network.") + parser.add_argument('--embedding_size', type=int, default=512, + help="the embedding vector size.") + + parser.add_argument('--warm_up_step', type=int, default=4000, + help="the number of warm-up steps of the learning rate schedule.") + parser.add_argument('--grad_clip_thresh', type=float, default=1.0, + help="the global-norm threshold for gradient clipping.") + parser.add_argument('--batch_size', type=int, default=32, + help="batch size for training.") + parser.add_argument('--epochs', type=int, default=10000, + help="the number of epochs for training.") + parser.add_argument('--lr', type=float, default=0.001, + help="the learning rate for training.") + parser.add_argument('--save_step', type=int, default=500, + help="checkpointing interval during training.") + parser.add_argument('--image_step', type=int, default=2000, + help="attention image logging interval during training.") + parser.add_argument('--max_len', type=int, default=400, + help="the max length of audio in synthesis.") + parser.add_argument('--transformer_step', type=int, default=160000, + help="Global step to restore checkpoint of transformer in synthesis.") + parser.add_argument('--postnet_step', type=int, default=100000, + help="Global step to restore checkpoint of postnet in synthesis.") + parser.add_argument('--use_gpu', type=bool, default=True, + help="whether to use gpu during training.") + parser.add_argument('--use_data_parallel', type=bool, default=False, + help="whether to use data parallel training.") + + parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + help="the path of the dataset.") + parser.add_argument('--checkpoint_path', type=str, default=None, + help="the path to load a checkpoint or pretrained model from.") + parser.add_argument('--save_path', type=str, default='./checkpoint', + help="the path to save checkpoints.") + parser.add_argument('--log_dir', type=str, default='./log', + help="the directory to save tensorboard logs.") + parser.add_argument('--sample_path', type=str, default='./log', + help="the directory to save audio samples in synthesis.") + + + parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile) diff --git a/parakeet/models/transformerTTS/preprocess.py b/parakeet/models/transformerTTS/preprocess.py new file mode 100644 index 0000000..b128b00 --- /dev/null +++ b/parakeet/models/transformerTTS/preprocess.py @@ -0,0 +1,123 @@ +from pathlib import Path +import numpy as np +import pandas as pd +import librosa + +from parakeet import g2p +from parakeet import audio + +from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler +from parakeet.data.dataset import Dataset +from parakeet.data.datacargo import DataCargo +from parakeet.data.batch import TextIDBatcher, SpecBatcher + +_ljspeech_processor = audio.AudioProcessor(
sample_rate=22050, + num_mels=80, + min_level_db=-100, + ref_level_db=20, + n_fft=2048, + win_length= int(22050 * 0.05), + hop_length= int(22050 * 0.0125), + power=1.2, + preemphasis=0.97, + signal_norm=True, + symmetric_norm=False, + max_norm=1., + mel_fmin=0, + mel_fmax=None, + clip_norm=True, + griffin_lim_iters=60, + do_trim_silence=False, + sound_norm=False) + +class LJSpeech(Dataset): + def __init__(self, root): + super(LJSpeech, self).__init__() + assert isinstance(root, (str, Path)), "root should be a string or Path object" + self.root = root if isinstance(root, Path) else Path(root) + self.metadata = self._prepare_metadata() + + def _prepare_metadata(self): + csv_path = self.root.joinpath("metadata.csv") + metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3, + names=["fname", "raw_text", "normalized_text"]) + return metadata + + def _get_example(self, metadatum): + """All the code for generating an Example from a metadatum. If you want a + different preprocessing pipeline, you can override this method. + This method may require several processor, each of which has a lot of options. + In this case, you'd better pass a composed transform and pass it to the init + method. + """ + + fname, raw_text, normalized_text = metadatum + wav_path = self.root.joinpath("wavs", fname + ".wav") + + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize + wav = _ljspeech_processor.load_wav(str(wav_path)) + mag = _ljspeech_processor.spectrogram(wav).astype(np.float32) + mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32) + phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + + def __getitem__(self, index): + metadatum = self.metadata.iloc[index] + example = self._get_example(metadatum) + return example + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __len__(self): + return len(self.metadata) + + +def batch_examples(batch): + texts = [] + mels = [] + mel_inputs = [] + text_lens = [] + pos_texts = [] + pos_mels = [] + for data in batch: + _, mel, text = data + mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + text_lens.append(len(text)) + pos_texts.append(np.arange(1, len(text) + 1)) + pos_mels.append(np.arange(1, mel.shape[1] + 1)) + mels.append(mel) + texts.append(text) + + # Sort by text_len in descending order + texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] + mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] + mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] + pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] + pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + text_lens = sorted(text_lens, reverse=True) + + # Pad sequence with largest len of the batch + texts = TextIDBatcher(pad_id=0)(texts) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1)) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens)) + +def batch_examples_vocoder(batch): + mels=[] + mags=[] + for data in batch: + mag, mel, _ = data + mels.append(mel) + 
mags.append(mag) + + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + + return (mels, mags) + + diff --git a/parakeet/models/transformerTTS/synthesis.py b/parakeet/models/transformerTTS/synthesis.py new file mode 100644 index 0000000..9c89d16 --- /dev/null +++ b/parakeet/models/transformerTTS/synthesis.py @@ -0,0 +1,67 @@ +import os +from scipy.io.wavfile import write +from parakeet.g2p.en import text_to_sequence +import numpy as np +from network import TransformerTTS, ModelPostNet +from tqdm import tqdm +from tensorboardX import SummaryWriter +import paddle.fluid as fluid +import paddle.fluid.dygraph as dg +from preprocess import _ljspeech_processor +from pathlib import Path +import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint + +def load_checkpoint(step, model_path): + model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + return model_dict + +def synthesis(text_input, cfg): + place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()) + + # tensorboard + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'synthesis') + + writer = SummaryWriter(path) + + with dg.guard(place): + model = TransformerTTS(cfg) + model_postnet = ModelPostNet(cfg) + + model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) + model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))) + + # init input + text = np.asarray(text_to_sequence(text_input)) + text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) + mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) + pos_text = np.arange(1, text.shape[1]+1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) + + + model.eval() + model_postnet.eval() + + pbar = tqdm(range(cfg.max_len)) + + for i in pbar: + pos_mel = np.arange(1, mel_input.shape[1]+1) + pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) + mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) + mag_pred = model_postnet(postnet_pred) + + wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) + writer.add_audio(text_input, wav, 0, cfg.audio.sr) + if not os.path.exists(cfg.sample_path): + os.mkdir(cfg.sample_path) + write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav) + +if __name__ == '__main__': + parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/synthesis.yaml'.split()) + synthesis("Transformer model is so fast!", cfg) \ No newline at end of file diff --git a/parakeet/models/transformerTTS/train_postnet.py b/parakeet/models/transformerTTS/train_postnet.py new file mode 100644 index 0000000..fe0f379 --- /dev/null +++ b/parakeet/models/transformerTTS/train_postnet.py @@ -0,0 +1,111 @@ +from network import * +from tensorboardX import SummaryWriter +import os +from tqdm import tqdm +from pathlib import Path +import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint +from parakeet.models.dataloader.jlspeech import LJSpeechLoader + +class MyDataParallel(dg.parallel.DataParallel): + """ + A data parallel proxy
for the wrapped model. + """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key == "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + + +def main(cfg): + + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 + + if local_rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(cfg)) + + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'postnet') + + writer = SummaryWriter(path) if local_rank == 0 else None + + with dg.guard(place): + model = ModelPostNet(cfg) + + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step), + parameter_list=model.parameters()) + + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("checkpoint loaded") + + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + model = MyDataParallel(model, strategy) + + reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader() + + for epoch in range(cfg.epochs): + pbar = tqdm(reader) + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + mel, mag = data + mag = dg.to_variable(mag.numpy()) + mel = dg.to_variable(mel.numpy()) + global_step += 1 + + mag_pred = model(mel) + loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) + + if cfg.use_data_parallel: + loss = model.scale_loss(loss) + loss.backward() + model.apply_collective_grads() + else: + loss.backward() + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + model.clear_gradients() + + if local_rank==0: + writer.add_scalars('training_loss',{ + 'loss':loss.numpy(), + }, global_step) + + if global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + + if local_rank==0: + writer.close() + +if __name__ == '__main__': + parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split()) + main(cfg) \ No newline at end of file diff --git a/parakeet/models/transformerTTS/train_transformer.py b/parakeet/models/transformerTTS/train_transformer.py new file mode 100644 index 0000000..8b177cd --- /dev/null +++ b/parakeet/models/transformerTTS/train_transformer.py @@ -0,0 +1,150 @@ +import os +from tqdm import tqdm +import paddle.fluid.dygraph as dg +import paddle.fluid.layers as layers +from network import * +from tensorboardX import SummaryWriter +from pathlib import Path
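+# matplotlib's colormaps ("cm", imported below) are only used to render attention maps.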
+import jsonargparse +from parse import add_config_options_to_parser +from pprint import pprint +from matplotlib import cm +from parakeet.modules.utils import cross_entropy +from parakeet.models.dataloader.jlspeech import LJSpeechLoader + +class MyDataParallel(dg.parallel.DataParallel): + """ + A data parallel proxy for the wrapped model. + """ + + def __init__(self, layers, strategy): + super(MyDataParallel, self).__init__(layers, strategy) + + def __getattr__(self, key): + if key in self.__dict__: + return object.__getattribute__(self, key) + elif key == "_layers": + return object.__getattribute__(self, "_sub_layers")["_layers"] + else: + return getattr( + object.__getattribute__(self, "_sub_layers")["_layers"], key) + + +def main(cfg): + local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0 + nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1 + + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + + if local_rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(cfg)) + + global_step = 0 + place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) + if cfg.use_data_parallel else fluid.CUDAPlace(0) + if cfg.use_gpu else fluid.CPUPlace()) + + if not os.path.exists(cfg.log_dir): + os.mkdir(cfg.log_dir) + path = os.path.join(cfg.log_dir,'transformer') + + writer = SummaryWriter(path) if local_rank == 0 else None + + with dg.guard(place): + model = TransformerTTS(cfg) + + model.train() + optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step), + parameter_list=model.parameters()) + + reader = LJSpeechLoader(cfg, nranks, local_rank).reader() + + if cfg.checkpoint_path is not None: + model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path) + model.set_dict(model_dict) + optimizer.set_dict(opti_dict) + print("checkpoint loaded") + + if cfg.use_data_parallel: + strategy = dg.parallel.prepare_context() + model = MyDataParallel(model, strategy) + + for epoch in range(cfg.epochs): + pbar = tqdm(reader) + + + for i, data in enumerate(pbar): + pbar.set_description('Processing at epoch %d'%epoch) + character, mel, mel_input, pos_text, pos_mel, text_length = data + + global_step += 1 + + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) + + label = np.zeros(stop_preds.shape).astype(np.float32) + text_length = text_length.numpy() + for idx in range(label.shape[0]): + label[idx][text_length[idx] - 1] = 1 + + mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) + post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) + stop_loss = cross_entropy(stop_preds, dg.to_variable(label)) + loss = mel_loss + post_mel_loss + stop_loss + + if local_rank==0: + writer.add_scalars('training_loss', { + 'mel_loss':mel_loss.numpy(), + 'post_mel_loss':post_mel_loss.numpy(), + 'stop_loss':stop_loss.numpy() + }, global_step) + + writer.add_scalars('alphas', { + 'encoder_alpha':model.encoder.alpha.numpy(), + 'decoder_alpha':model.decoder.alpha.numpy(), + }, global_step) + + writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + + if global_step % cfg.image_step == 1: + for i, prob in enumerate(attn_probs): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + for i, prob in enumerate(attn_enc): + for j in
range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + for i, prob in enumerate(attn_dec): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) + writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") + + if cfg.use_data_parallel: + loss = model.scale_loss(loss) + loss.backward() + model.apply_collective_grads() + else: + loss.backward() + optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) + model.clear_gradients() + + # save checkpoint + if local_rank==0 and global_step % cfg.save_step == 0: + if not os.path.exists(cfg.save_path): + os.mkdir(cfg.save_path) + save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step) + dg.save_dygraph(model.state_dict(), save_path) + dg.save_dygraph(optimizer.state_dict(), save_path) + if local_rank==0: + writer.close() + + +if __name__ =='__main__': + parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse') + add_config_options_to_parser(parser) + cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split()) + main(cfg) \ No newline at end of file diff --git a/parakeet/models/waveflow/README.md b/parakeet/models/waveflow/README.md new file mode 100644 index 0000000..d8072b1 --- /dev/null +++ b/parakeet/models/waveflow/README.md @@ -0,0 +1,111 @@ +# WaveFlow with Paddle Fluid + +Paddle fluid implementation of [WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219). + +## Project Structure +```text +├── configs # yaml configuration files of preset model hyperparameters +├── benchmark.py # benchmark code to test the speed of batched speech synthesis +├── data.py # dataset and dataloader settings for LJSpeech +├── synthesis.py # script for speech synthesis +├── train.py # script for model training +├── utils.py # helper functions for e.g., model checkpointing +├── waveflow.py # WaveFlow model high level APIs +└── waveflow_modules.py # WaveFlow model implementation +``` + +## Usage + +There are many hyperparameters to be tuned depending on the specification of model and dataset you are working on. +We provide `wavenet_ljspeech.yaml` as a hyperparameter set that works well on the LJSpeech dataset. + +Note that `train.py`, `synthesis.py`, and `benchmark.py` all accept a `--config` parameter. To ensure consistency, you should use the same config yaml file for both training, synthesizing and benchmarking. You can also overwrite these preset hyperparameters with command line by updating parameters after `--config`. +For example `--config=${yaml} --batch_size=8` can overwrite the corresponding hyperparameters in the `${yaml}` config file. For more details about these hyperparameters, check `utils.add_config_options_to_parser`. + +Note that you also need to specify some additional parameters for `train.py`, `synthesis.py`, and `benchmark.py`, and the details can be found in `train.add_options_to_parser`, `synthesis.add_options_to_parser`, and `benchmark.add_options_to_parser`, respectively. + +### Dataset + +Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). + +```bash +wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +tar xjvf LJSpeech-1.1.tar.bz2 +``` + +In this example, assume that the path of unzipped LJSpeech dataset is `./data/LJSpeech-1.1`. + +### Train on single GPU + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." 
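+# Make the in-tree parakeet package importable without installing it.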
+export CUDA_VISIBLE_DEVICES=0 +python -u train.py \ + --config=./configs/waveflow_ljspeech.yaml \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --batch_size=4 \ + --parallel=false --use_gpu=true +``` + +#### Save and Load checkpoints + +Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default. +The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. + +There are three ways to load a checkpoint and resume training (take an example that you want to load a 500000-iteration checkpoint): +1. Use `--checkpoint=./runs/waveflow/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed. +2. Use `--iteration=500000`. +3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/waveflow/${ModelName}/checkpoint`. + +### Train on multiple GPUs + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python -u -m paddle.distributed.launch train.py \ + --config=./configs/waveflow_ljspeech.yaml \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --parallel=true --use_gpu=true +``` + +Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to set the GPUs that you want to use to be visible. Then the `paddle.distributed.launch` module will use these visible GPUs to do data parallel training in multiprocessing mode. + +### Monitor with Tensorboard + +By default, the logs are saved in `./runs/waveflow/${ModelName}/logs/`. You can monitor logs by tensorboard. + +```bash +tensorboard --logdir=${log_dir} --port=8888 +``` + +### Synthesize from a checkpoint + +Check the [Save and load checkpoint](#save-and-load-checkpoints) section on how to load a specific checkpoint. +The following example will automatically load the latest checkpoint: + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." +export CUDA_VISIBLE_DEVICES=0 +python -u synthesis.py \ + --config=./configs/waveflow_ljspeech.yaml \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --use_gpu=true \ + --output=./syn_audios \ + --sample=${SAMPLE} \ + --sigma=1.0 +``` + +In this example, `--output` specifies where to save the synthesized audios and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset. + +### Benchmarking + +Use the following example to benchmark the speed of batched speech synthesis, which reports how many times faster than real-time: + +```bash +export PYTHONPATH="${PYTHONPATH}:${PWD}/../../.." 
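+# benchmark.py drives a single GPU (CUDAPlace(0)), so one visible device suffices.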
+export CUDA_VISIBLE_DEVICES=0 +python -u benchmark.py \ + --config=./configs/waveflow_ljspeech.yaml \ + --root=./data/LJSpeech-1.1 \ + --name=${ModelName} --use_gpu=true +``` \ No newline at end of file diff --git a/parakeet/models/waveflow/benchmark.py b/parakeet/models/waveflow/benchmark.py new file mode 100644 index 0000000..b2949d2 --- /dev/null +++ b/parakeet/models/waveflow/benchmark.py @@ -0,0 +1,71 @@ +import os +import random +from pprint import pprint + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid + +import utils +from waveflow import WaveFlow + + +def add_options_to_parser(parser): + parser.add_argument('--model', type=str, default='waveflow', + help="general name of the model") + parser.add_argument('--name', type=str, + help="specific name of the training model") + parser.add_argument('--root', type=str, + help="root path of the LJSpeech dataset") + + parser.add_argument('--use_gpu', type=bool, default=True, + help="option to use gpu training") + + parser.add_argument('--iteration', type=int, default=None, + help=("which iteration of checkpoint to load, " + "default to load the latest checkpoint")) + parser.add_argument('--checkpoint', type=str, default=None, + help="path of the checkpoint to load") + + +def benchmark(config): + pprint(jsonargparse.namespace_to_dict(config)) + + # Get checkpoint directory path. + run_dir = os.path.join("runs", config.model, config.name) + checkpoint_dir = os.path.join(run_dir, "checkpoint") + + # Configure device. + place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace() + + with dg.guard(place): + # Fix random seed. + seed = config.seed + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + print("Random Seed: ", seed) + + # Build model. + model = WaveFlow(config, checkpoint_dir) + model.build(training=False) + + # Run model inference. + model.benchmark() + + +if __name__ == "__main__": + # Create parser. + parser = jsonargparse.ArgumentParser( + description="Benchmark batched speech synthesis using the WaveFlow model", + formatter_class='default_argparse') + add_options_to_parser(parser) + utils.add_config_options_to_parser(parser) + + # Parse argument from both command line and yaml config file. + # For conflicting updates to the same field, + # the preceding update will be overwritten by the following one.
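+ # For example, "--config=x.yaml --batch_size=1" lets the later command-line
+ # flag override the batch size read from the yaml file.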
+ config = parser.parse_args() + benchmark(config) diff --git a/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml b/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml new file mode 100644 index 0000000..d3548c4 --- /dev/null +++ b/parakeet/models/waveflow/configs/waveflow_ljspeech.yaml @@ -0,0 +1,24 @@ +valid_size: 16 +segment_length: 16000 +sample_rate: 22050 +fft_window_shift: 256 +fft_window_size: 1024 +fft_size: 1024 +mel_bands: 80 +mel_fmin: 0.0 +mel_fmax: 8000.0 + +seed: 1234 +learning_rate: 0.0002 +batch_size: 8 +test_every: 2000 +save_every: 10000 +max_iterations: 3000000 + +sigma: 1.0 +n_flows: 8 +n_group: 16 +n_layers: 8 +n_channels: 64 +kernel_h: 3 +kernel_w: 3 diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py new file mode 100644 index 0000000..d89fb7b --- /dev/null +++ b/parakeet/models/waveflow/data.py @@ -0,0 +1,131 @@ +import random + +import librosa +import numpy as np +from paddle import fluid + +from parakeet.datasets import ljspeech +from parakeet.data import dataset +from parakeet.data.batch import SpecBatcher, WavBatcher +from parakeet.data.datacargo import DataCargo +from parakeet.data.sampler import DistributedSampler, BatchSampler +from scipy.io.wavfile import read + + +class Dataset(ljspeech.LJSpeech): + def __init__(self, config): + super(Dataset, self).__init__(config.root) + self.config = config + + def _get_example(self, metadatum): + fname, _, _ = metadatum + wav_path = self.root.joinpath("wavs", fname + ".wav") + + loaded_sr, audio = read(wav_path) + assert loaded_sr == self.config.sample_rate + + return audio + + +class Subset(dataset.Dataset): + def __init__(self, dataset, indices, valid): + self.dataset = dataset + self.indices = indices + self.valid = valid + self.config = dataset.config + + def get_mel(self, audio): + spectrogram = librosa.core.stft( + audio, n_fft=self.config.fft_size, + hop_length=self.config.fft_window_shift, + win_length=self.config.fft_window_size) + spectrogram_magnitude = np.abs(spectrogram) + + # mel_filter_bank shape: [n_mels, 1 + n_fft/2] + mel_filter_bank = librosa.filters.mel( + sr=self.config.sample_rate, + n_fft=self.config.fft_size, + n_mels=self.config.mel_bands, + fmin=self.config.mel_fmin, + fmax=self.config.mel_fmax) + # mel shape: [n_mels, num_frames] + mel = np.dot(mel_filter_bank, spectrogram_magnitude) + + # Normalize mel. + clip_val = 1e-5 + ref_constant = 1 + mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant) + + return mel + + def __getitem__(self, idx): + audio = self.dataset[self.indices[idx]] + segment_length = self.config.segment_length + + if self.valid: + # whole audio for valid set + pass + else: + # audio shape: [len] + if audio.shape[0] >= segment_length: + max_audio_start = audio.shape[0] - segment_length + audio_start = random.randint(0, max_audio_start) + audio = audio[audio_start : (audio_start + segment_length)] + else: + audio = np.pad(audio, (0, segment_length - audio.shape[0]), + mode='constant', constant_values=0) + + # Normalize audio to the [-1, 1] range. 
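+ # LJSpeech wavs are 16-bit PCM (read with scipy.io.wavfile above), so
+ # dividing the int16 samples by 32768.0 maps them into [-1, 1).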
+ audio = audio.astype(np.float32) / 32768.0 + mel = self.get_mel(audio) + + return audio, mel + + def _batch_examples(self, batch): + audios = [sample[0] for sample in batch] + mels = [sample[1] for sample in batch] + + audios = WavBatcher(pad_value=0.0)(audios) + mels = SpecBatcher(pad_value=0.0)(mels) + + return audios, mels + + def __len__(self): + return len(self.indices) + + +class LJSpeech: + def __init__(self, config, nranks, rank): + place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace() + + # Whole LJSpeech dataset. + ds = Dataset(config) + + # Split into train and valid dataset. + indices = list(range(len(ds))) + train_indices = indices[config.valid_size:] + valid_indices = indices[:config.valid_size] + random.shuffle(train_indices) + + # Train dataset. + trainset = Subset(ds, train_indices, valid=False) + sampler = DistributedSampler(len(trainset), nranks, rank) + total_bs = config.batch_size + assert total_bs % nranks == 0 + train_sampler = BatchSampler(sampler, total_bs // nranks, + drop_last=True) + trainloader = DataCargo(trainset, batch_sampler=train_sampler) + + trainreader = fluid.io.PyReader(capacity=50, return_list=True) + trainreader.decorate_batch_generator(trainloader, place) + self.trainloader = (data for _ in iter(int, 1) + for data in trainreader()) + + # Valid dataset. + validset = Subset(ds, valid_indices, valid=True) + # Currently only support batch_size = 1 for valid loader. + validloader = DataCargo(validset, batch_size=1, shuffle=False) + + validreader = fluid.io.PyReader(capacity=20, return_list=True) + validreader.decorate_batch_generator(validloader, place) + self.validloader = validreader diff --git a/parakeet/models/waveflow/synthesis.py b/parakeet/models/waveflow/synthesis.py new file mode 100644 index 0000000..e42e170 --- /dev/null +++ b/parakeet/models/waveflow/synthesis.py @@ -0,0 +1,85 @@ +import os +import random +from pprint import pprint + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid + +import utils +from waveflow import WaveFlow + + +def add_options_to_parser(parser): + parser.add_argument('--model', type=str, default='waveflow', + help="general name of the model") + parser.add_argument('--name', type=str, + help="specific name of the training model") + parser.add_argument('--root', type=str, + help="root path of the LJSpeech dataset") + + parser.add_argument('--use_gpu', type=bool, default=True, + help="option to use gpu training") + + parser.add_argument('--iteration', type=int, default=None, + help=("which iteration of checkpoint to load, " + "default to load the latest checkpoint")) + parser.add_argument('--checkpoint', type=str, default=None, + help="path of the checkpoint to load") + + parser.add_argument('--output', type=str, default="./syn_audios", + help="path to write synthesized audio files") + parser.add_argument('--sample', type=int, default=None, + help="which of the valid samples to synthesize audio") + + +def synthesize(config): + pprint(jsonargparse.namespace_to_dict(config)) + + # Get checkpoint directory path. + run_dir = os.path.join("runs", config.model, config.name) + checkpoint_dir = os.path.join(run_dir, "checkpoint") + + # Configurate device. + place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace() + + with dg.guard(place): + # Fix random seed. 
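+ # A fixed seed makes the sampled Gaussian latent (and hence the output audio) reproducible.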
+ seed = config.seed + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + print("Random Seed: ", seed) + + # Build model. + model = WaveFlow(config, checkpoint_dir) + model.build(training=False) + + # Obtain the current iteration. + if config.checkpoint is None: + if config.iteration is None: + iteration = utils.load_latest_checkpoint(checkpoint_dir) + else: + iteration = config.iteration + else: + iteration = int(config.checkpoint.split('/')[-1].split('-')[-1]) + + # Run model inference. + model.infer(iteration) + + +if __name__ == "__main__": + # Create parser. + parser = jsonargparse.ArgumentParser( + description="Synthesize audio using the WaveFlow model", + formatter_class='default_argparse') + add_options_to_parser(parser) + utils.add_config_options_to_parser(parser) + + # Parse argument from both command line and yaml config file. + # For conflicting updates to the same field, + # the preceding update will be overwritten by the following one. + config = parser.parse_args() + synthesize(config) diff --git a/parakeet/models/waveflow/train.py b/parakeet/models/waveflow/train.py new file mode 100644 index 0000000..89b787a --- /dev/null +++ b/parakeet/models/waveflow/train.py @@ -0,0 +1,114 @@ +import os +import random +import subprocess +import time +from pprint import pprint + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid +from tensorboardX import SummaryWriter + +import slurm +import utils +from waveflow import WaveFlow + + +def add_options_to_parser(parser): + parser.add_argument('--model', type=str, default='waveflow', + help="general name of the model") + parser.add_argument('--name', type=str, + help="specific name of the training model") + parser.add_argument('--root', type=str, + help="root path of the LJSpeech dataset") + + parser.add_argument('--parallel', type=bool, default=True, + help="option to use data parallel training") + parser.add_argument('--use_gpu', type=bool, default=True, + help="option to use gpu training") + + parser.add_argument('--iteration', type=int, default=None, + help=("which iteration of checkpoint to load, " + "default to load the latest checkpoint")) + parser.add_argument('--checkpoint', type=str, default=None, + help="path of the checkpoint to load") + + +def train(config): + use_gpu = config.use_gpu + parallel = config.parallel if use_gpu else False + + # Get the rank of the current training process. + rank = dg.parallel.Env().local_rank if parallel else 0 + nranks = dg.parallel.Env().nranks if parallel else 1 + + if rank == 0: + # Print the whole config setting. + pprint(jsonargparse.namespace_to_dict(config)) + + # Make checkpoint directory. + run_dir = os.path.join("runs", config.model, config.name) + checkpoint_dir = os.path.join(run_dir, "checkpoint") + os.makedirs(checkpoint_dir, exist_ok=True) + + # Create tensorboard logger. + tb = SummaryWriter(os.path.join(run_dir, "logs")) \ + if rank == 0 else None + + # Configure device. + place = fluid.CUDAPlace(rank) if use_gpu else fluid.CPUPlace() + + with dg.guard(place): + # Fix random seed. + seed = config.seed + random.seed(seed) + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + print("Random Seed: ", seed) + + # Build model. + model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb) + model.build() + + # Obtain the current iteration.
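+ # An explicit --checkpoint path takes precedence over --iteration;
+ # with neither given, the latest recorded checkpoint index is used.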
+ if config.checkpoint is None: + if config.iteration is None: + iteration = utils.load_latest_checkpoint(checkpoint_dir, rank) + else: + iteration = config.iteration + else: + iteration = int(config.checkpoint.split('/')[-1].split('-')[-1]) + + while iteration < config.max_iterations: + # Run one single training step. + model.train_step(iteration) + + iteration += 1 + + if iteration % config.test_every == 0: + # Run validation step. + model.valid_step(iteration) + + if rank == 0 and iteration % config.save_every == 0: + # Save parameters. + model.save(iteration) + + # Close TensorBoard. + if rank == 0: + tb.close() + + +if __name__ == "__main__": + # Create parser. + parser = jsonargparse.ArgumentParser(description="Train WaveFlow model", + formatter_class='default_argparse') + add_options_to_parser(parser) + utils.add_config_options_to_parser(parser) + + # Parse argument from both command line and yaml config file. + # For conflicting updates to the same field, + # the preceding update will be overwritten by the following one. + config = parser.parse_args() + train(config) diff --git a/parakeet/models/waveflow/utils.py b/parakeet/models/waveflow/utils.py new file mode 100644 index 0000000..3baeb60 --- /dev/null +++ b/parakeet/models/waveflow/utils.py @@ -0,0 +1,114 @@ +import itertools +import os +import time + +import jsonargparse +import numpy as np +import paddle.fluid.dygraph as dg + + +def add_config_options_to_parser(parser): + parser.add_argument('--valid_size', type=int, + help="size of the valid dataset") + parser.add_argument('--segment_length', type=int, + help="the length of audio clip for training") + parser.add_argument('--sample_rate', type=int, + help="sampling rate of audio data file") + parser.add_argument('--fft_window_shift', type=int, + help="the shift of fft window for each frame") + parser.add_argument('--fft_window_size', type=int, + help="the size of fft window for each frame") + parser.add_argument('--fft_size', type=int, + help="the size of fft filter on each frame") + parser.add_argument('--mel_bands', type=int, + help="the number of mel bands when calculating mel spectrograms") + parser.add_argument('--mel_fmin', type=float, + help="lowest frequency in calculating mel spectrograms") + parser.add_argument('--mel_fmax', type=float, + help="highest frequency in calculating mel spectrograms") + + parser.add_argument('--seed', type=int, + help="seed of random initialization for the model") + parser.add_argument('--learning_rate', type=float) + parser.add_argument('--batch_size', type=int, + help="batch size for training") + parser.add_argument('--test_every', type=int, + help="test interval during training") + parser.add_argument('--save_every', type=int, + help="checkpointing interval during training") + parser.add_argument('--max_iterations', type=int, + help="maximum training iterations") + + parser.add_argument('--sigma', type=float, + help="standard deviation of the latent Gaussian variable") + parser.add_argument('--n_flows', type=int, + help="number of flows") + parser.add_argument('--n_group', type=int, + help="number of adjacent audio samples to squeeze into one column") + parser.add_argument('--n_layers', type=int, + help="number of conv2d layer in one wavenet-like flow architecture") + parser.add_argument('--n_channels', type=int, + help="number of residual channels in flow") + parser.add_argument('--kernel_h', type=int, + help="height of the kernel in the conv2d layer") + parser.add_argument('--kernel_w', type=int, + help="width of the kernel in the 
conv2d layer") + + parser.add_argument('--config', action=jsonargparse.ActionConfigFile) + + +def load_latest_checkpoint(checkpoint_dir, rank=0): + checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") + # Create checkpoint index file if not exist. + if (not os.path.isfile(checkpoint_path)) and rank == 0: + with open(checkpoint_path, "w") as handle: + handle.write("model_checkpoint_path: step-0") + + # Make sure that other process waits until checkpoint file is created + # by process 0. + while not os.path.isfile(checkpoint_path): + time.sleep(1) + + # Fetch the latest checkpoint index. + with open(checkpoint_path, "r") as handle: + latest_checkpoint = handle.readline().split()[-1] + iteration = int(latest_checkpoint.split("-")[-1]) + + return iteration + + +def save_latest_checkpoint(checkpoint_dir, iteration): + checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") + # Update the latest checkpoint index. + with open(checkpoint_path, "w") as handle: + handle.write("model_checkpoint_path: step-{}".format(iteration)) + + +def load_parameters(checkpoint_dir, rank, model, optimizer=None, + iteration=None, file_path=None): + if file_path is None: + if iteration is None: + iteration = load_latest_checkpoint(checkpoint_dir, rank) + if iteration == 0: + return + file_path = "{}/step-{}".format(checkpoint_dir, iteration) + + model_dict, optimizer_dict = dg.load_dygraph(file_path) + model.set_dict(model_dict) + print("[checkpoint] Rank {}: loaded model from {}".format(rank, file_path)) + if optimizer and optimizer_dict: + optimizer.set_dict(optimizer_dict) + print("[checkpoint] Rank {}: loaded optimizer state from {}".format( + rank, file_path)) + + +def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None): + file_path = "{}/step-{}".format(checkpoint_dir, iteration) + model_dict = model.state_dict() + dg.save_dygraph(model_dict, file_path) + print("[checkpoint] Saved model to {}".format(file_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + dg.save_dygraph(opt_dict, file_path) + print("[checkpoint] Saved optimzier state to {}".format(file_path)) diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py new file mode 100644 index 0000000..4935d42 --- /dev/null +++ b/parakeet/models/waveflow/waveflow.py @@ -0,0 +1,190 @@ +import itertools +import os +import time + +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid +from scipy.io.wavfile import write + +import utils +from data import LJSpeech +from waveflow_modules import WaveFlowLoss, WaveFlowModule + + +class WaveFlow(): + def __init__(self, config, checkpoint_dir, parallel=False, rank=0, + nranks=1, tb_logger=None): + self.config = config + self.checkpoint_dir = checkpoint_dir + self.parallel = parallel + self.rank = rank + self.nranks = nranks + self.tb_logger = tb_logger + + def build(self, training=True): + config = self.config + dataset = LJSpeech(config, self.nranks, self.rank) + self.trainloader = dataset.trainloader + self.validloader = dataset.validloader + + waveflow = WaveFlowModule("waveflow", config) + + # Dry run once to create and initalize all necessary parameters. + audio = dg.to_variable(np.random.randn(1, 16000).astype(np.float32)) + mel = dg.to_variable( + np.random.randn(1, config.mel_bands, 63).astype(np.float32)) + waveflow(audio, mel) + + if training: + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=config.learning_rate) + + # Load parameters. 
+ utils.load_parameters(self.checkpoint_dir, self.rank, + waveflow, optimizer, + iteration=config.iteration, + file_path=config.checkpoint) + print("Rank {}: checkpoint loaded.".format(self.rank)) + + # Data parallelism. + if self.parallel: + strategy = dg.parallel.prepare_context() + waveflow = dg.parallel.DataParallel(waveflow, strategy) + + self.waveflow = waveflow + self.optimizer = optimizer + self.criterion = WaveFlowLoss(config.sigma) + + else: + # Load parameters. + utils.load_parameters(self.checkpoint_dir, self.rank, waveflow, + iteration=config.iteration, + file_path=config.checkpoint) + print("Rank {}: checkpoint loaded.".format(self.rank)) + + self.waveflow = waveflow + + def train_step(self, iteration): + self.waveflow.train() + + start_time = time.time() + audios, mels = next(self.trainloader) + load_time = time.time() + + outputs = self.waveflow(audios, mels) + loss = self.criterion(outputs) + + if self.parallel: + # loss = loss / num_trainers + loss = self.waveflow.scale_loss(loss) + loss.backward() + self.waveflow.apply_collective_grads() + else: + loss.backward() + + self.optimizer.minimize(loss, parameter_list=self.waveflow.parameters()) + self.waveflow.clear_gradients() + + graph_time = time.time() + + if self.rank == 0: + loss_val = float(loss.numpy()) * self.nranks + log = "Rank: {} Step: {:^8d} Loss: {:<8.3f} " \ + "Time: {:.3f}/{:.3f}".format( + self.rank, iteration, loss_val, + load_time - start_time, graph_time - load_time) + print(log) + + tb = self.tb_logger + tb.add_scalar("Train-Loss-Rank-0", loss_val, iteration) + + @dg.no_grad + def valid_step(self, iteration): + self.waveflow.eval() + tb = self.tb_logger + + total_loss = [] + sample_audios = [] + start_time = time.time() + + for i, batch in enumerate(self.validloader()): + audios, mels = batch + valid_outputs = self.waveflow(audios, mels) + valid_z, valid_log_s_list = valid_outputs + + # Visualize latent z and scale log_s. + if self.rank == 0 and i == 0: + tb.add_histogram("Valid-Latent_z", valid_z.numpy(), iteration) + for j, valid_log_s in enumerate(valid_log_s_list): + hist_name = "Valid-{}th-Flow-Log_s".format(j) + tb.add_histogram(hist_name, valid_log_s.numpy(), iteration) + + valid_loss = self.criterion(valid_outputs) + total_loss.append(float(valid_loss.numpy())) + + total_time = time.time() - start_time + if self.rank == 0: + loss_val = np.mean(total_loss) + log = "Test | Rank: {} AvgLoss: {:<8.3f} Time {:<8.3f}".format( + self.rank, loss_val, total_time) + print(log) + tb.add_scalar("Valid-Avg-Loss", loss_val, iteration) + + @dg.no_grad + def infer(self, iteration): + self.waveflow.eval() + + config = self.config + sample = config.sample + + output = "{}/{}/iter-{}".format(config.output, config.name, iteration) + os.makedirs(output, exist_ok=True) + + mels_list = [mels for _, mels in self.validloader()] + if sample is not None: + mels_list = [mels_list[sample]] + + for sample, mel in enumerate(mels_list): + filename = "{}/valid_{}.wav".format(output, sample) + print("Synthesize sample {}, save as {}".format(sample, filename)) + + start_time = time.time() + audio = self.waveflow.synthesize(mel, sigma=self.config.sigma) + syn_time = time.time() - start_time + + audio = audio[0] + audio_time = audio.shape[0] / self.config.sample_rate + print("audio time {:.4f}, synthesis time {:.4f}".format( + audio_time, syn_time)) + + # Denormalize audio from [-1, 1] to [-32768, 32768] int16 range. 
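+ # (np.int16 tops out at 32767, so a sample at exactly 1.0 would wrap around;
+ # clipping before the cast would be safer.)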
+ audio = audio.numpy() * 32768.0 + audio = audio.astype('int16') + write(filename, config.sample_rate, audio) + + @dg.no_grad + def benchmark(self): + self.waveflow.eval() + + mels_list = [mels for _, mels in self.validloader()] + mel = fluid.layers.concat(mels_list, axis=2) + mel = mel[:, :, :864] + batch_size = 8 + mel = fluid.layers.expand(mel, [batch_size, 1, 1]) + + for i in range(10): + start_time = time.time() + audio = self.waveflow.synthesize(mel, sigma=self.config.sigma) + print("audio.shape = ", audio.shape) + syn_time = time.time() - start_time + + audio_time = audio.shape[1] * batch_size / self.config.sample_rate + print("audio time {:.4f}, synthesis time {:.4f}".format( + audio_time, syn_time)) + print("{} X real-time".format(audio_time / syn_time)) + + def save(self, iteration): + utils.save_latest_parameters(self.checkpoint_dir, iteration, + self.waveflow, self.optimizer) + utils.save_latest_checkpoint(self.checkpoint_dir, iteration) diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py new file mode 100644 index 0000000..39cb598 --- /dev/null +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -0,0 +1,351 @@ +import itertools + +import numpy as np +import paddle.fluid.dygraph as dg +from paddle import fluid +from parakeet.modules import conv, modules, weight_norm + + +def set_param_attr(layer, c_in=1): + if isinstance(layer, (weight_norm.Conv2DTranspose, weight_norm.Conv2D)): + k = np.sqrt(1.0 / (c_in * np.prod(layer._filter_size))) + weight_init = fluid.initializer.UniformInitializer(low=-k, high=k) + bias_init = fluid.initializer.UniformInitializer(low=-k, high=k) + elif isinstance(layer, dg.Conv2D): + weight_init = fluid.initializer.ConstantInitializer(0.0) + bias_init = fluid.initializer.ConstantInitializer(0.0) + else: + raise TypeError("Unsupported layer type.") + + layer._param_attr = fluid.ParamAttr(initializer=weight_init) + layer._bias_attr = fluid.ParamAttr(initializer=bias_init) + + +def unfold(x, n_group): + length = x.shape[-1] + new_shape = x.shape[:-1] + [length // n_group, n_group] + return fluid.layers.reshape(x, new_shape) + + +class WaveFlowLoss: + def __init__(self, sigma=1.0): + self.sigma = sigma + + def __call__(self, model_output): + z, log_s_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = fluid.layers.reduce_sum(log_s) + else: + log_s_total = log_s_total + fluid.layers.reduce_sum(log_s) + + loss = fluid.layers.reduce_sum(z * z) / (2 * self.sigma * self.sigma) \ + - log_s_total + loss = loss / np.prod(z.shape) + const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma) + + return loss + const + + +class Conditioner(dg.Layer): + def __init__(self, name_scope): + super(Conditioner, self).__init__(name_scope) + upsample_factors = [16, 16] + + self.upsample_conv2d = [] + for s in upsample_factors: + in_channel = 1 + conv_trans2d = modules.Conv2DTranspose( + self.full_name(), + num_filters=1, + filter_size=(3, 2 * s), + padding=(1, s // 2), + stride=(1, s)) + set_param_attr(conv_trans2d, c_in=in_channel) + self.upsample_conv2d.append(conv_trans2d) + + for i, layer in enumerate(self.upsample_conv2d): + self.add_sublayer("conv2d_transpose_{}".format(i), layer) + + def forward(self, x): + x = fluid.layers.unsqueeze(x, 1) + for layer in self.upsample_conv2d: + x = fluid.layers.leaky_relu(layer(x), alpha=0.4) + + return fluid.layers.squeeze(x, [1]) + + def infer(self, x): + x = fluid.layers.unsqueeze(x, 1) + for layer in self.upsample_conv2d: + x = layer(x) + # Trim 
conv artifacts.
+ time_cutoff = layer._filter_size[1] - layer._stride[1]
+ x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4)
+
+ return fluid.layers.squeeze(x, [1])
+
+
+class Flow(dg.Layer):
+ def __init__(self, name_scope, config):
+ super(Flow, self).__init__(name_scope)
+ self.n_layers = config.n_layers
+ self.n_channels = config.n_channels
+ self.kernel_h = config.kernel_h
+ self.kernel_w = config.kernel_w
+
+ # Transform audio: [batch, 1, n_group, time/n_group]
+ # => [batch, n_channels, n_group, time/n_group]
+ self.start = weight_norm.Conv2D(
+ self.full_name(),
+ num_filters=self.n_channels,
+ filter_size=(1, 1))
+ set_param_attr(self.start, c_in=1)
+
+ # Initializing the last layer to 0 makes the affine coupling layers
+ # do nothing at first. This helps with training stability.
+ # output shape: [batch, 2, n_group, time/n_group]
+ self.end = dg.Conv2D(
+ self.full_name(),
+ num_filters=2,
+ filter_size=(1, 1))
+ set_param_attr(self.end)
+
+ # Receptive field: (kernel - 1) * sum(dilations) + 1 >= n_group (the squeezed height)
+ dilation_dict = {8: [1, 1, 1, 1, 1, 1, 1, 1],
+ 16: [1, 1, 1, 1, 1, 1, 1, 1],
+ 32: [1, 2, 4, 1, 2, 4, 1, 2],
+ 64: [1, 2, 4, 8, 16, 1, 2, 4],
+ 128: [1, 2, 4, 8, 16, 32, 64, 1]}
+ self.dilation_h_list = dilation_dict[config.n_group]
+
+ self.in_layers = []
+ self.cond_layers = []
+ self.res_skip_layers = []
+ for i in range(self.n_layers):
+ dilation_h = self.dilation_h_list[i]
+ dilation_w = 2 ** i
+
+ in_layer = weight_norm.Conv2D(
+ self.full_name(),
+ num_filters=2 * self.n_channels,
+ filter_size=(self.kernel_h, self.kernel_w),
+ dilation=(dilation_h, dilation_w))
+ set_param_attr(in_layer, c_in=self.n_channels)
+ self.in_layers.append(in_layer)
+
+ cond_layer = weight_norm.Conv2D(
+ self.full_name(),
+ num_filters=2 * self.n_channels,
+ filter_size=(1, 1))
+ set_param_attr(cond_layer, c_in=config.mel_bands)
+ self.cond_layers.append(cond_layer)
+
+ if i < self.n_layers - 1:
+ res_skip_channels = 2 * self.n_channels
+ else:
+ res_skip_channels = self.n_channels
+ res_skip_layer = weight_norm.Conv2D(
+ self.full_name(),
+ num_filters=res_skip_channels,
+ filter_size=(1, 1))
+ set_param_attr(res_skip_layer, c_in=self.n_channels)
+ self.res_skip_layers.append(res_skip_layer)
+
+ self.add_sublayer("in_layer_{}".format(i), in_layer)
+ self.add_sublayer("cond_layer_{}".format(i), cond_layer)
+ self.add_sublayer("res_skip_layer_{}".format(i), res_skip_layer)
+
+ def forward(self, audio, mel):
+ # audio: [bs, 1, n_group, time/n_group]
+ # mel: [bs, mel_bands, n_group, time/n_group]
+ audio = self.start(audio)
+
+ for i in range(self.n_layers):
+ dilation_h = self.dilation_h_list[i]
+ dilation_w = 2 ** i
+
+ # Pad height dim (n_group): causal convolution
+ # Pad width dim (time): dilated non-causal convolution
+ pad_top, pad_bottom = (self.kernel_h - 1) * dilation_h, 0
+ pad_left = pad_right = int((self.kernel_w-1) * dilation_w / 2)
+ audio_pad = fluid.layers.pad2d(audio,
+ paddings=[pad_top, pad_bottom, pad_left, pad_right])
+
+ hidden = self.in_layers[i](audio_pad)
+ cond_hidden = self.cond_layers[i](mel)
+ in_acts = hidden + cond_hidden
+ out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
+ fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
+ res_skip_acts = self.res_skip_layers[i](out_acts)
+
+ if i < self.n_layers - 1:
+ audio += res_skip_acts[:, :self.n_channels, :, :]
+ skip_acts = res_skip_acts[:, self.n_channels:, :, :]
+ else:
+ skip_acts = res_skip_acts
+
+ if i == 0:
+ output = skip_acts
+ else:
+ output += skip_acts
+
+ return self.end(output)
+
+ def infer(self, audio, mel, queues):
+ audio = self.start(audio)
+
+ for i in range(self.n_layers):
+ dilation_h = self.dilation_h_list[i]
+ dilation_w = 2 ** i
+
+ state_size = dilation_h * (self.kernel_h - 1)
+ queue = queues[i]
+
+ if len(queue) == 0:
+ for j in range(state_size):
+ queue.append(fluid.layers.zeros_like(audio))
+
+ state = queue[0:state_size]
+ state = fluid.layers.concat([*state, audio], axis=2)
+
+ queue.pop(0)
+ queue.append(audio)
+
+ # Height dim (n_group) stays causal: past rows come from the queue,
+ # so no top padding is needed here.
+ # Pad width dim (time): dilated non-causal convolution
+ pad_top, pad_bottom = 0, 0
+ pad_left = int((self.kernel_w-1) * dilation_w / 2)
+ pad_right = int((self.kernel_w-1) * dilation_w / 2)
+ state = fluid.layers.pad2d(state,
+ paddings=[pad_top, pad_bottom, pad_left, pad_right])
+
+ hidden = self.in_layers[i](state)
+ cond_hidden = self.cond_layers[i](mel)
+ in_acts = hidden + cond_hidden
+ out_acts = fluid.layers.tanh(in_acts[:, :self.n_channels, :]) * \
+ fluid.layers.sigmoid(in_acts[:, self.n_channels:, :])
+ res_skip_acts = self.res_skip_layers[i](out_acts)
+
+ if i < self.n_layers - 1:
+ audio += res_skip_acts[:, :self.n_channels, :, :]
+ skip_acts = res_skip_acts[:, self.n_channels:, :, :]
+ else:
+ skip_acts = res_skip_acts
+
+ if i == 0:
+ output = skip_acts
+ else:
+ output += skip_acts
+
+ return self.end(output)
+
+
+class WaveFlowModule(dg.Layer):
+ def __init__(self, name_scope, config):
+ super(WaveFlowModule, self).__init__(name_scope)
+ self.n_flows = config.n_flows
+ self.n_group = config.n_group
+ self.n_layers = config.n_layers
+ assert self.n_group % 2 == 0
+ assert self.n_flows % 2 == 0
+
+ self.conditioner = Conditioner(self.full_name())
+ self.flows = []
+ for i in range(self.n_flows):
+ flow = Flow(self.full_name(), config)
+ self.flows.append(flow)
+ self.add_sublayer("flow_{}".format(i), flow)
+
+ self.perms = []
+ half = self.n_group // 2
+ for i in range(self.n_flows):
+ perm = list(range(self.n_group))
+ if i < self.n_flows // 2:
+ perm = perm[::-1]
+ else:
+ perm[:half] = reversed(perm[:half])
+ perm[half:] = reversed(perm[half:])
+ self.perms.append(perm)
+
+ def forward(self, audio, mel):
+ mel = self.conditioner(mel)
+ assert mel.shape[2] >= audio.shape[1]
+ # Prune out the tail of audio/mel so that time % n_group == 0.
+ pruned_len = audio.shape[1] // self.n_group * self.n_group
+
+ if audio.shape[1] > pruned_len:
+ audio = audio[:, :pruned_len]
+ if mel.shape[2] > pruned_len:
+ mel = mel[:, :, :pruned_len]
+
+ # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
+ mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
+ # From [bs, time] to [bs, n_group, time/n_group]
+ audio = fluid.layers.transpose(unfold(audio, self.n_group), [0, 2, 1])
+ # [bs, 1, n_group, time/n_group]
+ audio = fluid.layers.unsqueeze(audio, 1)
+
+ log_s_list = []
+ for i in range(self.n_flows):
+ inputs = audio[:, :, :-1, :]
+ conds = mel[:, :, 1:, :]
+ outputs = self.flows[i](inputs, conds)
+ log_s = outputs[:, :1, :, :]
+ b = outputs[:, 1:, :, :]
+ log_s_list.append(log_s)
+
+ audio_0 = audio[:, :, :1, :]
+ audio_out = audio[:, :, 1:, :] * fluid.layers.exp(log_s) + b
+ audio = fluid.layers.concat([audio_0, audio_out], axis=2)
+
+ # Permute over the height dim.
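+ # As a concrete example, with n_group=8 the first half of the flows
+ # uses the full reversal [7, 6, 5, 4, 3, 2, 1, 0], while the second
+ # half reverses each half separately: [3, 2, 1, 0, 7, 6, 5, 4].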
+ audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
+ audio = fluid.layers.stack(audio_slices, axis=2)
+ mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
+ mel = fluid.layers.stack(mel_slices, axis=2)
+
+ z = fluid.layers.squeeze(audio, [1])
+
+ return z, log_s_list
+
+ def synthesize(self, mel, sigma=1.0):
+ mel = self.conditioner.infer(mel)
+ # From [bs, mel_bands, time] to [bs, mel_bands, n_group, time/n_group]
+ mel = fluid.layers.transpose(unfold(mel, self.n_group), [0, 1, 3, 2])
+
+ audio = fluid.layers.gaussian_random(
+ shape=[mel.shape[0], 1, mel.shape[2], mel.shape[3]], std=sigma)
+
+ for i in reversed(range(self.n_flows)):
+ # Permute over the height dimension.
+ audio_slices = [audio[:, :, j, :] for j in self.perms[i]]
+ audio = fluid.layers.stack(audio_slices, axis=2)
+ mel_slices = [mel[:, :, j, :] for j in self.perms[i]]
+ mel = fluid.layers.stack(mel_slices, axis=2)
+
+ audio_list = []
+ audio_0 = audio[:, :, 0:1, :]
+ audio_list.append(audio_0)
+ audio_h = audio_0
+ queues = [[] for _ in range(self.n_layers)]
+
+ for h in range(1, self.n_group):
+ inputs = audio_h
+ conds = mel[:, :, h:(h+1), :]
+ outputs = self.flows[i].infer(inputs, conds, queues)
+
+ log_s = outputs[:, 0:1, :, :]
+ b = outputs[:, 1:, :, :]
+ audio_h = (audio[:, :, h:(h+1), :] - b) / \
+ fluid.layers.exp(log_s)
+ audio_list.append(audio_h)
+
+ audio = fluid.layers.concat(audio_list, axis=2)
+
+ # audio: [bs, n_group, time/n_group]
+ audio = fluid.layers.squeeze(audio, [1])
+ # audio: [bs, time]
+ audio = fluid.layers.reshape(
+ fluid.layers.transpose(audio, [0, 2, 1]), [audio.shape[0], -1])
+
+ return audio
diff --git a/parakeet/modules/dynamicGRU.py b/parakeet/modules/dynamicGRU.py
new file mode 100644
index 0000000..e84c598
--- /dev/null
+++ b/parakeet/modules/dynamicGRU.py
@@ -0,0 +1,52 @@
+import paddle.fluid.dygraph as dg
+import paddle.fluid.layers as layers
+
+class DynamicGRU(dg.Layer):
+ def __init__(self,
+ size,
+ param_attr=None,
+ bias_attr=None,
+ is_reverse=False,
+ gate_activation='sigmoid',
+ candidate_activation='tanh',
+ h_0=None,
+ origin_mode=False,
+ init_size=None):
+ super(DynamicGRU, self).__init__()
+ self.gru_unit = dg.GRUUnit(
+ size * 3,
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ activation=candidate_activation,
+ gate_activation=gate_activation,
+ origin_mode=origin_mode)
+ self.size = size
+ self.h_0 = h_0
+ self.is_reverse = is_reverse
+
+ def forward(self, inputs):
+ """
+ Dynamic GRU block.
+
+ Args:
+ inputs (Variable): Shape(B, T, C), dtype: float32. The input value
+ (C must equal 3 * size, as required by GRUUnit).
+ Returns:
+ output (Variable), Shape(B, T, size), the sequence of hidden states
+ computed by the GRU.
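+
+ Example (an illustrative sketch; the hidden size of 256 and the zero
+ initial state are assumptions, not values fixed by this module):
+
+ h_0 = layers.zeros(shape=[batch_size, 256], dtype="float32")
+ gru = DynamicGRU(size=256, h_0=h_0)
+ out = gru(inputs) # inputs: [batch_size, time, 3 * 256]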
+ """
+ hidden = self.h_0
+ res = []
+ for i in range(inputs.shape[1]):
+ if self.is_reverse:
+ i = inputs.shape[1] - 1 - i
+ input_ = inputs[:, i:i + 1, :]
+ input_ = layers.reshape(
+ input_, [-1, input_.shape[2]], inplace=False)
+ hidden, reset, gate = self.gru_unit(input_, hidden)
+ hidden_ = layers.reshape(
+ hidden, [-1, 1, hidden.shape[1]], inplace=False)
+ res.append(hidden_)
+ if self.is_reverse:
+ res = res[::-1]
+ res = layers.concat(res, axis=1)
+ return res
+
diff --git a/parakeet/modules/feed_forward.py b/parakeet/modules/feed_forward.py
new file mode 100644
index 0000000..452c482
--- /dev/null
+++ b/parakeet/modules/feed_forward.py
@@ -0,0 +1,52 @@
+import paddle.fluid.dygraph as dg
+import paddle.fluid.layers as layers
+import paddle.fluid as fluid
+import math
+from parakeet.modules.layers import Conv
+
+
+class PositionwiseFeedForward(dg.Layer):
+ ''' A two-feed-forward-layer module '''
+ def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
+ super(PositionwiseFeedForward, self).__init__()
+ self.num_hidden = num_hidden
+ self.use_cudnn = use_cudnn
+ self.dropout = dropout
+
+ self.w_1 = Conv(in_channels = d_in,
+ out_channels = num_hidden,
+ filter_size = filter_size,
+ padding=padding,
+ use_cudnn = use_cudnn,
+ data_format = "NTC")
+
+ self.w_2 = Conv(in_channels = num_hidden,
+ out_channels = d_in,
+ filter_size = filter_size,
+ padding=padding,
+ use_cudnn = use_cudnn,
+ data_format = "NTC")
+ self.layer_norm = dg.LayerNorm(d_in)
+
+ def forward(self, input):
+ """
+ Feed Forward Network.
+
+ Args:
+ input (Variable): Shape(B, T, C), dtype: float32. The input value.
+ Returns:
+ output (Variable), Shape(B, T, C), the result after FFN.
+ """
+ # FFN network
+ x = self.w_2(layers.relu(self.w_1(input)))
+
+ # dropout
+ x = layers.dropout(x, self.dropout)
+
+ # residual connection
+ x = x + input
+
+ # layer normalization
+ output = self.layer_norm(x)
+
+ return output
\ No newline at end of file
diff --git a/parakeet/modules/layers.py b/parakeet/modules/layers.py
new file mode 100644
index 0000000..29a10db
--- /dev/null
+++ b/parakeet/modules/layers.py
@@ -0,0 +1,158 @@
+import math
+import numpy as np
+
+import paddle
+from paddle import fluid
+import paddle.fluid.dygraph as dg
+
+
+class Conv(dg.Layer):
+ def __init__(self, in_channels, out_channels, filter_size=1,
+ padding=0, dilation=1, stride=1, use_cudnn=True,
+ data_format="NCT", is_bias=True):
+ super(Conv, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.filter_size = filter_size
+ self.padding = padding
+ self.dilation = dilation
+ self.stride = stride
+ self.use_cudnn = use_cudnn
+ self.data_format = data_format
+ self.is_bias = is_bias
+
+ self.weight_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer())
+ self.bias_attr = None
+ if is_bias is not False:
+ k = math.sqrt(1 / in_channels)
+ self.bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))
+
+ self.conv = Conv1D( in_channels = in_channels,
+ out_channels = out_channels,
+ filter_size = filter_size,
+ padding = padding,
+ dilation = dilation,
+ stride = stride,
+ param_attr = self.weight_attr,
+ bias_attr = self.bias_attr,
+ use_cudnn = use_cudnn,
+ data_format = data_format)
+
+ def forward(self, x):
+ x = self.conv(x)
+ return x
+
+class Conv1D(dg.Layer):
+ """
+ A convolution 1D block implemented with Conv2D. For simplicity, and to
+ ensure the output has the same length as the input, it does not allow
+ stride > 1.
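+
+ Example (an illustrative sketch; the channel counts and shapes below
+ are assumptions for demonstration, not defaults of this class):
+
+ conv = Conv1D(in_channels=80, out_channels=256, filter_size=3,
+ padding=1, data_format="NTC")
+ y = conv(x) # x: [batch, time, 80] -> y: [batch, time, 256]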
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ filter_size=3,
+ padding=0,
+ dilation=1,
+ stride=1,
+ groups=None,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ act=None,
+ data_format='NCT',
+ dtype="float32"):
+ super(Conv1D, self).__init__(dtype=dtype)
+
+ self.padding = padding
+ self.in_channels = in_channels
+ self.num_filters = out_channels
+ self.filter_size = filter_size
+ self.stride = stride
+ self.dilation = dilation
+ self.padding = padding
+ self.act = act
+ self.data_format = data_format
+
+ self.conv = dg.Conv2D(
+ num_channels=in_channels,
+ num_filters=out_channels,
+ filter_size=(1, filter_size),
+ stride=(1, stride),
+ dilation=(1, dilation),
+ padding=(0, padding),
+ groups=groups,
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ use_cudnn=use_cudnn,
+ act=act,
+ dtype=dtype)
+
+ def forward(self, x):
+ """
+ Args:
+ x (Variable): Shape(B, C_in, T) (or Shape(B, T, C_in) when
+ data_format is 'NTC'), the input, where C_in means input channels.
+ Returns:
+ x (Variable): Shape(B, C_out, T) (or Shape(B, T, C_out)), the
+ output, where C_out means output channels (num_filters).
+ """
+ if self.data_format == 'NTC':
+ x = fluid.layers.transpose(x, [0, 2, 1])
+ x = fluid.layers.unsqueeze(x, [2])
+ x = self.conv(x)
+ x = fluid.layers.squeeze(x, [2])
+ if self.data_format == 'NTC':
+ x = fluid.layers.transpose(x, [0, 2, 1])
+ return x
+
+class Pool1D(dg.Layer):
+ """
+ A Pool 1D block implemented with Pool2D.
+ """
+ def __init__(self,
+ pool_size=-1,
+ pool_type='max',
+ pool_stride=1,
+ pool_padding=0,
+ global_pooling=False,
+ use_cudnn=True,
+ ceil_mode=False,
+ exclusive=True,
+ data_format='NCT'):
+ super(Pool1D, self).__init__()
+ self.pool_size = pool_size
+ self.pool_type = pool_type
+ self.pool_stride = pool_stride
+ self.pool_padding = pool_padding
+ self.global_pooling = global_pooling
+ self.use_cudnn = use_cudnn
+ self.ceil_mode = ceil_mode
+ self.exclusive = exclusive
+ self.data_format = data_format
+
+
+ self.pool2d = dg.Pool2D([1,pool_size], pool_type = pool_type,
+ pool_stride = [1,pool_stride], pool_padding = [0, pool_padding],
+ global_pooling = global_pooling, use_cudnn = use_cudnn,
+ ceil_mode = ceil_mode, exclusive = exclusive)
+
+
+ def forward(self, x):
+ """
+ Args:
+ x (Variable): Shape(B, C, T) (or Shape(B, T, C) when data_format
+ is 'NTC'), the input.
+ Returns:
+ x (Variable): the pooled output with the same layout and the same
+ number of channels as the input.
+ """
+ if self.data_format == 'NTC':
+ x = fluid.layers.transpose(x, [0, 2, 1])
+ x = fluid.layers.unsqueeze(x, [2])
+ x = self.pool2d(x)
+ x = fluid.layers.squeeze(x, [2])
+ if self.data_format == 'NTC':
+ x = fluid.layers.transpose(x, [0, 2, 1])
+ return x
diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py
new file mode 100644
index 0000000..b2592bb
--- /dev/null
+++ b/parakeet/modules/multihead_attention.py
@@ -0,0 +1,112 @@
+import math
+import numpy as np
+import paddle.fluid.dygraph as dg
+import paddle.fluid.layers as layers
+
+class ScaledDotProductAttention(dg.Layer):
+ def __init__(self, d_key):
+ super(ScaledDotProductAttention, self).__init__()
+
+ self.d_key = d_key
+
+ # Note: the mask convention here differs from the PyTorch implementation.
+ def forward(self, key, value, query, mask=None, query_mask=None, dropout=0.1):
+ """
+ Scaled Dot Product Attention.
+
+ Args:
+ key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
+ value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
+ query (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
+ mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
+ query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
+ dropout (float): The dropout probability.
+ Returns:
+ result (Variable), Shape(B, T, C), the result of scaled dot-product attention.
+ attention (Variable), Shape(B, len_q, len_k), the attention weights over keys.
+ """
+ # Compute attention score
+ attention = layers.matmul(query, key, transpose_y=True) # transpose the last two dims of key
+ attention = attention / math.sqrt(self.d_key)
+
+ # Mask key to ignore padding
+ if mask is not None:
+ attention = attention * mask
+ mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
+ attention = attention + mask
+
+
+ attention = layers.softmax(attention)
+ attention = layers.dropout(attention, dropout)
+ # Mask query to ignore padding
+ if query_mask is not None:
+ attention = attention * query_mask
+
+ result = layers.matmul(attention, value)
+ return result, attention
+
+class MultiheadAttention(dg.Layer):
+ def __init__(self, num_hidden, d_k, d_q, num_head=4, dropout=0.1):
+ super(MultiheadAttention, self).__init__()
+ self.num_hidden = num_hidden
+ self.num_head = num_head
+ self.d_k = d_k
+ self.d_q = d_q
+ self.dropout = dropout
+
+ self.key = dg.Linear(num_hidden, num_head * d_k)
+ self.value = dg.Linear(num_hidden, num_head * d_k)
+ self.query = dg.Linear(num_hidden, num_head * d_q)
+
+ self.scal_attn = ScaledDotProductAttention(d_k)
+
+ self.fc = dg.Linear(num_head * d_q, num_hidden)
+
+ self.layer_norm = dg.LayerNorm(num_hidden)
+
+ def forward(self, key, value, query_input, mask=None, query_mask=None):
+ """
+ Multihead Attention.
+
+ Args:
+ key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
+ value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
+ query_input (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
+ mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
+ query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
+ Returns:
+ result (Variable), Shape(B, T, C), the result of multihead attention.
+ attention (Variable), Shape(n_head * B, len_q, len_k), the attention weights over keys.
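+
+ Example (a minimal sketch; num_hidden=256 with the default 4 heads of
+ size 64 is an assumed configuration, not one required by the module):
+
+ attn = MultiheadAttention(num_hidden=256, d_k=64, d_q=64)
+ result, attention = attn(encoder_out, encoder_out, decoder_query)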
+ """ + batch_size = key.shape[0] + seq_len_key = key.shape[1] + seq_len_query = query_input.shape[1] + + # repeat masks h times + if query_mask is not None: + query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) + if mask is not None: + mask = layers.expand(mask, (self.num_head, 1, 1)) + + + # Make multihead attention + # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) + key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) + value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k]) + query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q]) + + key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q]) + + result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) + + # concat all multihead result + result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) + result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + + result = layers.dropout(self.fc(result), self.dropout) + result = result + query_input + + result = self.layer_norm(result) + return result, attention \ No newline at end of file diff --git a/parakeet/modules/post_convnet.py b/parakeet/modules/post_convnet.py new file mode 100644 index 0000000..559d70e --- /dev/null +++ b/parakeet/modules/post_convnet.py @@ -0,0 +1,75 @@ +import paddle.fluid.dygraph as dg +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from parakeet.modules.layers import Conv + +class PostConvNet(dg.Layer): + def __init__(self, + n_mels=80, + num_hidden=512, + filter_size=5, + padding=0, + num_conv=5, + outputs_per_step=1, + use_cudnn=True, + dropout=0.1): + super(PostConvNet, self).__init__() + + self.dropout = dropout + self.conv_list = [] + self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step, + out_channels = num_hidden, + filter_size = filter_size, + padding = padding, + use_cudnn = use_cudnn, + data_format = "NCT")) + + for _ in range(1, num_conv-1): + self.conv_list.append(Conv(in_channels = num_hidden, + out_channels = num_hidden, + filter_size = filter_size, + padding = padding, + use_cudnn = use_cudnn, + data_format = "NCT") ) + + self.conv_list.append(Conv(in_channels = num_hidden, + out_channels = n_mels * outputs_per_step, + filter_size = filter_size, + padding = padding, + use_cudnn = use_cudnn, + data_format = "NCT")) + + for i, layer in enumerate(self.conv_list): + self.add_sublayer("conv_list_{}".format(i), layer) + + self.batch_norm_list = [dg.BatchNorm(num_hidden, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW') for _ in range(num_conv-1)] + self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, + param_attr = fluid.ParamAttr(name='weight'), + bias_attr = fluid.ParamAttr(name='bias'), + moving_mean_name = 'moving_mean', + moving_variance_name = 'moving_var', + data_layout='NCHW')) + for i, layer in enumerate(self.batch_norm_list): + self.add_sublayer("batch_norm_list_{}".format(i), layer) + + + def forward(self, input): + """ + Post Conv Net. 
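+ Applies the stack of (conv -> batch norm -> tanh -> dropout) blocks,
+ trimming each convolution output back to the input length.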
+
+ Args:
+ input (Variable): Shape(B, T, C), dtype: float32. The input value.
+ Returns:
+ output (Variable), Shape(B, T, C), the result after postconvnet.
+ """
+ input = layers.transpose(input, [0,2,1])
+ seq_len = input.shape[-1]
+ for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
+ input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:seq_len])), self.dropout)
+ output = layers.transpose(input, [0,2,1])
+ return output
\ No newline at end of file
diff --git a/parakeet/modules/prenet.py b/parakeet/modules/prenet.py
new file mode 100644
index 0000000..4ea50e1
--- /dev/null
+++ b/parakeet/modules/prenet.py
@@ -0,0 +1,31 @@
+import paddle.fluid.dygraph as dg
+import paddle.fluid.layers as layers
+
+class PreNet(dg.Layer):
+ def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
+ """
+ :param input_size: dimension of input
+ :param hidden_size: dimension of hidden unit
+ :param output_size: dimension of output
+ """
+ super(PreNet, self).__init__()
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.output_size = output_size
+ self.dropout_rate = dropout_rate
+
+ self.linear1 = dg.Linear(input_size, hidden_size)
+ self.linear2 = dg.Linear(hidden_size, output_size)
+
+ def forward(self, x):
+ """
+ Pre-net applied before passing through the main network.
+
+ Args:
+ x (Variable): Shape(B, T, C), dtype: float32. The input value.
+ Returns:
+ x (Variable), Shape(B, T, C), the result after prenet.
+ """
+ x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate)
+ x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate)
+ return x
diff --git a/parakeet/modules/utils.py b/parakeet/modules/utils.py
new file mode 100644
index 0000000..626d5f2
--- /dev/null
+++ b/parakeet/modules/utils.py
@@ -0,0 +1,73 @@
+import numpy as np
+import librosa
+import os, copy
+from scipy import signal
+import paddle.fluid.layers as layers
+
+
+def get_positional_table(d_pos_vec, n_position=1024):
+ position_enc = np.array([
+ [pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)]
+ if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
+
+ position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
+ position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
+ return position_enc
+
+def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
+ ''' Sinusoid position encoding table '''
+
+ def cal_angle(position, hid_idx):
+ return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
+
+ def get_posi_angle_vec(position):
+ return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
+
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+
+ if padding_idx is not None:
+ # zero vector for padding dimension
+ sinusoid_table[padding_idx] = 0.
+
+ return sinusoid_table
+
+def get_non_pad_mask(seq):
+ return layers.unsqueeze((seq != 0).astype(np.float32),[-1])
+
+def get_attn_key_pad_mask(seq_k, seq_q):
+ ''' For masking out the padding part of the key sequence. '''
+
+ # Expand to fit the shape of the key-query attention matrix.
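+ # E.g. for seq_k = [[5, 7, 0]] and len_q = 2 the mask becomes
+ # [[[1, 1, 0], [1, 1, 0]]], marking the padded key position with 0.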
+ len_q = seq_q.shape[1]
+ padding_mask = (seq_k != 0).astype(np.float32)
+ padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1])
+ return padding_mask
+
+def get_triu_tensor(seq_k, seq_q):
+ ''' Make an upper-triangular (triu) mask tensor. '''
+ len_k = seq_k.shape[1]
+ len_q = seq_q.shape[1]
+ batch_size = seq_k.shape[0]
+ triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
+ triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0)
+
+ return triu_tensor
+
+def guided_attention(N, T, g=0.2):
+ '''Guided attention. Refer to page 3 of the paper.'''
+ W = np.zeros((N, T), dtype=np.float32)
+ for n_pos in range(W.shape[0]):
+ for t_pos in range(W.shape[1]):
+ W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
+ return W
+
+
+def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
+ ce = -1 * label * layers.log(input + epsilon) - (1 - label) * layers.log(1 - input + epsilon)
+ weighted_ce = ce * (label * (position_weight - 1) + 1)
+ return layers.reduce_sum(weighted_ce, dim=[0, 1])
+
+
diff --git a/tests/test_ljspeech.py b/tests/test_ljspeech.py
index 04db6a9..34f5011 100644
--- a/tests/test_ljspeech.py
+++ b/tests/test_ljspeech.py
@@ -7,4 +7,4 @@ LJSPEECH_ROOT = Path("/workspace/datasets/LJSpeech-1.1")
ljspeech = LJSpeech(LJSPEECH_ROOT)
ljspeech_cargo = DataCargo(ljspeech, batch_size=16, shuffle=True)
for i, batch in enumerate(ljspeech_cargo):
- print(i)
\ No newline at end of file
+ print(i)
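diff --git a/examples/module_smoke_test.py b/examples/module_smoke_test.py
new file mode 100644
--- /dev/null
+++ b/examples/module_smoke_test.py
@@ -0,0 +1,15 @@
+# An illustrative smoke test for the new modules (a documentation sketch;
+# the file name, shapes, and hyperparameters here are assumptions, not
+# part of the original change).
+import numpy as np
+import paddle.fluid.dygraph as dg
+
+from parakeet.modules.feed_forward import PositionwiseFeedForward
+
+if __name__ == "__main__":
+ with dg.guard():
+ ffn = PositionwiseFeedForward(d_in=256, num_hidden=1024, filter_size=1)
+ x = dg.to_variable(np.random.randn(2, 50, 256).astype("float32"))
+ y = ffn(x)
+ # The residual connection and layer norm keep the input shape: [2, 50, 256].
+ print(y.shape)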