diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4102b69..9d6da44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,3 +25,11 @@ files: \.md$ - id: remove-tabs files: \.md$ +- repo: local + hooks: + - id: copyright_checker + name: copyright_checker + entry: python ./tools/copyright.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ + exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/examples/deepvoice3/README.md b/examples/deepvoice3/README.md index 43e1939..0138414 100644 --- a/examples/deepvoice3/README.md +++ b/examples/deepvoice3/README.md @@ -1,4 +1,4 @@ -# Deepvoice 3 +# Deepvoice 3 Paddle implementation of deepvoice 3 in dynamic graph, a convolutional network based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654). @@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed ## Project Structure ```text -├── data.py data_processing +├── data.py data_processing ├── ljspeech.yaml (example) configuration file ├── sentences.txt sample sentences ├── synthesis.py script to synthesize waveform from text @@ -50,7 +50,7 @@ optional arguments: The directory to save result. -g DEVICE, --device DEVICE device to use -``` +``` 1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. And you can change some values in the configuration file and train the model with a different config. 2. `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt). @@ -61,7 +61,7 @@ optional arguments: ├── checkpoints # checkpoint ├── log # tensorboard log └── states # train and evaluation results - ├── alignments # attention + ├── alignments # attention ├── lin_spec # linear spectrogram ├── mel_spec # mel spectrogram └── waveform # waveform (.wav files) @@ -112,4 +112,3 @@ example script: ```bash python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated ``` - diff --git a/examples/deepvoice3/data.py b/examples/deepvoice3/data.py index 8f6b2ce..68f54cd 100644 --- a/examples/deepvoice3/data.py +++ b/examples/deepvoice3/data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
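The new `copyright_checker` entry above runs `tools/copyright.hook` over staged sources, but the hook script itself is not part of this diff. For orientation, a minimal sketch of what such a checker might look like, assuming (as `language: system` hooks do) that pre-commit invokes the entry with the matched filenames as arguments; the real script may differ:

```python
# Hypothetical sketch of a header checker like tools/copyright.hook;
# the actual script is not shown in this diff and may differ.
import sys

MARK = "Licensed under the Apache License, Version 2.0"


def has_header(path):
    # pre-commit passes every staged file matching the `files` regex.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return MARK in f.read(2048)  # headers sit at the top of a file


if __name__ == "__main__":
    bad = [p for p in sys.argv[1:] if not has_header(p)]
    for p in bad:
        print("missing copyright header: " + p)
    sys.exit(1 if bad else 0)  # a non-zero exit blocks the commit
```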
+ import os import csv from pathlib import Path @@ -79,10 +93,11 @@ class Transform(object): y = signal.lfilter([1., -self.preemphasis], [1.], wav) # STFT - D = librosa.stft(y=y, - n_fft=self.n_fft, - win_length=self.win_length, - hop_length=self.hop_length) + D = librosa.stft( + y=y, + n_fft=self.n_fft, + win_length=self.win_length, + hop_length=self.hop_length) S = np.abs(D) # to db and normalize to 0-1 @@ -96,11 +111,8 @@ class Transform(object): # mel scale and to db and normalize to 0-1, # CAUTION: pass linear scale S, not dbscaled S - S_mel = librosa.feature.melspectrogram(S=S, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=self.fmax, - power=1.) + S_mel = librosa.feature.melspectrogram( + S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.) S_mel = 20 * np.log10(np.maximum(amplitude_min, S_mel)) - self.ref_level_db S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db) @@ -148,20 +160,18 @@ class DataCollector(object): (mix_grapheme_phonemes, text_length, speaker_id, S_norm, S_mel_norm, num_frames) = example text_sequences.append( - np.pad(mix_grapheme_phonemes, - (0, max_text_length - text_length))) + np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length + ))) lin_specs.append( - np.pad(S_norm, - ((0, 0), (self._pad_begin, - max_frames - self._pad_begin - num_frames)))) + np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames - + self._pad_begin - num_frames)))) mel_specs.append( - np.pad(S_mel_norm, - ((0, 0), (self._pad_begin, - max_frames - self._pad_begin - num_frames)))) + np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames - + self._pad_begin - num_frames)))) done_flags.append( np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )), - (0, max_decoder_length - - int(np.ceil(num_frames // self._factor))), + (0, max_decoder_length - int( + np.ceil(num_frames // self._factor))), constant_values=1)) text_sequences = np.array(text_sequences).astype(np.int64) lin_specs = np.transpose(np.array(lin_specs), diff --git a/examples/deepvoice3/synthesis.py b/examples/deepvoice3/synthesis.py index 303c182..5162e07 100644 --- a/examples/deepvoice3/synthesis.py +++ b/examples/deepvoice3/synthesis.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
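The hunks above reformat the feature-extraction pipeline in `data.py`: preemphasis → STFT → magnitude → dB → [0, 1] normalization, with the mel branch deliberately fed the linear-scale `S` rather than the dB-scaled one (per the in-code CAUTION). The same steps as a standalone sketch; the constants are illustrative stand-ins for the values in `ljspeech.yaml`, which this diff does not show:

```python
# Standalone sketch of the Transform pipeline in examples/deepvoice3/data.py.
# All constants here are illustrative, not the actual ljspeech.yaml settings.
import librosa
import numpy as np
from scipy import signal

preemphasis = 0.97
n_fft, win_length, hop_length = 1024, 1024, 256
n_mels, fmin, fmax = 80, 0, 8000
ref_level_db, min_level_db, amplitude_min = 20, -100, 1e-5

sr = 22050
wav = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # dummy audio

# preemphasis: y[n] = x[n] - a * x[n - 1]
y = signal.lfilter([1., -preemphasis], [1.], wav)

# STFT -> linear-scale magnitude
D = librosa.stft(y=y, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
S = np.abs(D)

# to dB, then normalize to [0, 1]
S_db = 20 * np.log10(np.maximum(amplitude_min, S)) - ref_level_db
S_norm = (S_db - min_level_db) / (-min_level_db)

# mel scale from the *linear* S (the CAUTION in data.py), then dB + normalize
S_mel = librosa.feature.melspectrogram(
    S=S, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax, power=1.)
S_mel_db = 20 * np.log10(np.maximum(amplitude_min, S_mel)) - ref_level_db
S_mel_norm = (S_mel_db - min_level_db) / (-min_level_db)
```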
+ import os import argparse import ruamel.yaml @@ -22,11 +36,8 @@ if __name__ == "__main__": parser.add_argument("checkpoint", type=str, help="checkpoint to load.") parser.add_argument("text", type=str, help="text file to synthesize") parser.add_argument("output_path", type=str, help="path to save results") - parser.add_argument("-g", - "--device", - type=int, - default=-1, - help="device to use") + parser.add_argument( + "-g", "--device", type=int, default=-1, help="device to use") args = parser.parse_args() with open(args.config, 'rt') as f: @@ -76,15 +87,14 @@ if __name__ == "__main__": window_ahead = model_config["window_ahead"] key_projection = model_config["key_projection"] value_projection = model_config["value_projection"] - dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, - padding_idx, embedding_std, max_positions, n_vocab, - freeze_embedding, filter_size, encoder_channels, - n_mels, decoder_channels, r, - trainable_positional_encodings, use_memory_mask, - query_position_rate, key_position_rate, - window_backward, window_ahead, key_projection, - value_projection, downsample_factor, linear_dim, - use_decoder_states, converter_channels, dropout) + dv3 = make_model( + n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx, + embedding_std, max_positions, n_vocab, freeze_embedding, + filter_size, encoder_channels, n_mels, decoder_channels, r, + trainable_positional_encodings, use_memory_mask, + query_position_rate, key_position_rate, window_backward, + window_ahead, key_projection, value_projection, downsample_factor, + linear_dim, use_decoder_states, converter_channels, dropout) summary(dv3) state, _ = dg.load_dygraph(args.checkpoint) diff --git a/examples/deepvoice3/train.py b/examples/deepvoice3/train.py index 6d9aef6..ad42822 100644 --- a/examples/deepvoice3/train.py +++ b/examples/deepvoice3/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import argparse import ruamel.yaml diff --git a/examples/deepvoice3/utils.py b/examples/deepvoice3/utils.py index 02118af..756d008 100644 --- a/examples/deepvoice3/utils.py +++ b/examples/deepvoice3/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
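The `synthesis.py` hunk above re-wraps a `make_model` call that threads roughly twenty-five values read from `model_config` positionally, so argument order is load-bearing at this call site. As a hypothetical refactor (not part of this diff, and assuming the keys match the yaml's model section), unpacking the config by name would make such call sites harder to get wrong:

```python
# Hypothetical keyword-based variant of the make_model call site; the key
# names below are examples, not the full signature used in utils.py.
KEYS = ("n_speakers", "speaker_dim", "embed_dim", "max_positions", "dropout")


def build_from_config(model_config, make_model_fn):
    missing = [k for k in KEYS if k not in model_config]
    if missing:
        # fail fast with names instead of a silent positional-order bug
        raise KeyError("config is missing: {}".format(", ".join(missing)))
    return make_model_fn(**{k: model_config[k] for k in KEYS})
```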
+ import os import numpy as np from matplotlib import cm @@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, converter_channels, dropout): """just a simple function to create a deepvoice 3 model""" if n_speakers > 1: - spe = dg.Embedding((n_speakers, speaker_dim), - param_attr=I.Normal(scale=speaker_embed_std)) + spe = dg.Embedding( + (n_speakers, speaker_dim), + param_attr=I.Normal(scale=speaker_embed_std)) else: spe = None @@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ConvSpec(h, k, 9), ConvSpec(h, k, 27), ConvSpec(h, k, 1), - ConvSpec(h, k, 3), - ) - enc = Encoder(n_vocab, - embed_dim, - n_speakers, - speaker_dim, - padding_idx=None, - embedding_weight_std=embedding_std, - convolutions=encoder_convolutions, - max_positions=max_positions, - dropout=dropout) + ConvSpec(h, k, 3), ) + enc = Encoder( + n_vocab, + embed_dim, + n_speakers, + speaker_dim, + padding_idx=None, + embedding_weight_std=embedding_std, + convolutions=encoder_convolutions, + max_positions=max_positions, + dropout=dropout) if freeze_embedding: freeze(enc.embed) @@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ConvSpec(h, k, 3), ConvSpec(h, k, 9), ConvSpec(h, k, 27), - ConvSpec(h, k, 1), - ) + ConvSpec(h, k, 1), ) attention = [True, False, False, False, True] force_monotonic_attention = [True, False, False, False, True] - dec = Decoder(n_speakers, - speaker_dim, - embed_dim, - mel_dim, - r=r, - max_positions=max_positions, - padding_idx=padding_idx, - preattention=prenet_convolutions, - convolutions=attentive_convolutions, - attention=attention, - dropout=dropout, - use_memory_mask=use_memory_mask, - force_monotonic_attention=force_monotonic_attention, - query_position_rate=query_position_rate, - key_position_rate=key_position_rate, - window_range=WindowRange(window_behind, window_ahead), - key_projection=key_projection, - value_projection=value_projection) + dec = Decoder( + n_speakers, + speaker_dim, + embed_dim, + mel_dim, + r=r, + max_positions=max_positions, + padding_idx=padding_idx, + preattention=prenet_convolutions, + convolutions=attentive_convolutions, + attention=attention, + dropout=dropout, + use_memory_mask=use_memory_mask, + force_monotonic_attention=force_monotonic_attention, + query_position_rate=query_position_rate, + key_position_rate=key_position_rate, + window_range=WindowRange(window_behind, window_ahead), + key_projection=key_projection, + value_projection=value_projection) if not trainable_positional_encodings: freeze(dec.embed_keys_positions) freeze(dec.embed_query_positions) @@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ConvSpec(h, k, 1), ConvSpec(h, k, 3), ConvSpec(2 * h, k, 1), - ConvSpec(2 * h, k, 3), - ) - cvt = Converter(n_speakers, - speaker_dim, - dec.state_dim if use_decoder_states else mel_dim, - linear_dim, - time_upsampling=downsample_factor, - convolutions=postnet_convolutions, - dropout=dropout) + ConvSpec(2 * h, k, 3), ) + cvt = Converter( + n_speakers, + speaker_dim, + dec.state_dim if use_decoder_states else mel_dim, + linear_dim, + time_upsampling=downsample_factor, + convolutions=postnet_convolutions, + dropout=dropout) dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states) return dv3 @@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db, ref_level_db, power, n_iter, win_length, hop_length, preemphasis): """generate waveform from text using a deepvoice 3 model""" - text = 
np.array(en.text_to_sequence(text, p=replace_pronounciation_prob), - dtype=np.int64) + text = np.array( + en.text_to_sequence( + text, p=replace_pronounciation_prob), + dtype=np.int64) length = len(text) print("text sequence's length: {}".format(length)) text_positions = np.arange(1, 1 + length) @@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter, """ denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10)) - wav = librosa.griffinlim(lin_scaled**power, - n_iter=n_iter, - hop_length=hop_length, - win_length=win_length) + wav = librosa.griffinlim( + lin_scaled**power, + n_iter=n_iter, + hop_length=hop_length, + win_length=win_length) if preemphasis > 0: wav = signal.lfilter([1.], [1., -preemphasis], wav) return wav @@ -225,28 +243,30 @@ def save_state(save_dir, plt.colorbar() plt.title("mel_input") plt.savefig( - os.path.join(path, - "target_mel_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "target_mel_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("target/mel_spec", - cm.viridis(mel_input), - global_step, - dataformats="HWC") + writer.add_image( + "target/mel_spec", + cm.viridis(mel_input), + global_step, + dataformats="HWC") plt.figure(figsize=(10, 3)) display.specshow(mel_output) plt.colorbar() plt.title("mel_output") plt.savefig( - os.path.join( - path, "predicted_mel_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "predicted_mel_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("predicted/mel_spec", - cm.viridis(mel_output), - global_step, - dataformats="HWC") + writer.add_image( + "predicted/mel_spec", + cm.viridis(mel_output), + global_step, + dataformats="HWC") if lin_input is not None and lin_output is not None: lin_input = lin_input[0].numpy().T @@ -258,28 +278,30 @@ def save_state(save_dir, plt.colorbar() plt.title("lin_input") plt.savefig( - os.path.join(path, - "target_lin_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "target_lin_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("target/lin_spec", - cm.viridis(lin_input), - global_step, - dataformats="HWC") + writer.add_image( + "target/lin_spec", + cm.viridis(lin_input), + global_step, + dataformats="HWC") plt.figure(figsize=(10, 3)) display.specshow(lin_output) plt.colorbar() plt.title("lin_output") plt.savefig( - os.path.join( - path, "predicted_lin_spec_step{:09d}.png".format(global_step))) + os.path.join(path, "predicted_lin_spec_step{:09d}.png".format( + global_step))) plt.close() - writer.add_image("predicted/lin_spec", - cm.viridis(lin_output), - global_step, - dataformats="HWC") + writer.add_image( + "predicted/lin_spec", + cm.viridis(lin_output), + global_step, + dataformats="HWC") if alignments is not None and len(alignments.shape) == 4: path = os.path.join(save_dir, "alignments") @@ -290,10 +312,11 @@ def save_state(save_dir, "train_attn_layer_{}_step_{}.png".format(idx, global_step)) plot_alignment(attn_layer, save_path) - writer.add_image("train_attn/layer_{}".format(idx), - cm.viridis(attn_layer), - global_step, - dataformats="HWC") + writer.add_image( + "train_attn/layer_{}".format(idx), + cm.viridis(attn_layer), + global_step, + dataformats="HWC") if lin_output is not None: wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power, @@ -302,7 +325,5 @@ def save_state(save_dir, save_path = os.path.join( path,
"train_sample_step_{:09d}.wav".format(global_step)) sf.write(save_path, wav, sample_rate) - writer.add_audio("train_sample", - wav, - global_step, - sample_rate=sample_rate) + writer.add_audio( + "train_sample", wav, global_step, sample_rate=sample_rate) diff --git a/examples/fastspeech/README.md b/examples/fastspeech/README.md index 007b6b2..1199b8b 100644 --- a/examples/fastspeech/README.md +++ b/examples/fastspeech/README.md @@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr if you wish to resume from an exists model, please set ``--checkpoint_path`` and ``--fastspeech_step`` -For more help on arguments: +For more help on arguments: ``python train.py --help``. ## Synthesis @@ -75,5 +75,5 @@ or you can run the script file directly. sh synthesis.sh ``` -For more help on arguments: +For more help on arguments: ``python synthesis.py --help``. diff --git a/examples/fastspeech/parse.py b/examples/fastspeech/parse.py index a6c2d99..690f4b2 100644 --- a/examples/fastspeech/parse.py +++ b/examples/fastspeech/parse.py @@ -1,36 +1,90 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + def add_config_options_to_parser(parser): - parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml', + parser.add_argument( + '--config_path', + type=str, + default='config/fastspeech.yaml', help="the yaml config file path.") - parser.add_argument('--batch_size', type=int, default=32, - help="batch size for training.") - parser.add_argument('--epochs', type=int, default=10000, + parser.add_argument( + '--batch_size', type=int, default=32, help="batch size for training.") + parser.add_argument( + '--epochs', + type=int, + default=10000, help="the number of epoch for training.") - parser.add_argument('--lr', type=float, default=0.001, + parser.add_argument( + '--lr', + type=float, + default=0.001, help="the learning rate for training.") - parser.add_argument('--save_step', type=int, default=500, + parser.add_argument( + '--save_step', + type=int, + default=500, help="checkpointing interval during training.") - parser.add_argument('--fastspeech_step', type=int, default=70000, + parser.add_argument( + '--fastspeech_step', + type=int, + default=70000, help="Global step to restore checkpoint of fastspeech.") - parser.add_argument('--use_gpu', type=int, default=1, + parser.add_argument( + '--use_gpu', + type=int, + default=1, help="use gpu or not during training.") - parser.add_argument('--use_data_parallel', type=int, default=0, + parser.add_argument( + '--use_data_parallel', + type=int, + default=0, help="use data parallel or not during training.") - parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + parser.add_argument( + '--data_path', + type=str, + default='./dataset/LJSpeech-1.1', help="the path of dataset.") - parser.add_argument('--checkpoint_path', type=str, default=None, + parser.add_argument( + '--checkpoint_path', + type=str, + 
default=None, help="the path to load checkpoint or pretrain model.") - parser.add_argument('--save_path', type=str, default='./checkpoint', + parser.add_argument( + '--save_path', + type=str, + default='./checkpoint', help="the path to save checkpoint.") - parser.add_argument('--log_dir', type=str, default='./log', + parser.add_argument( + '--log_dir', + type=str, + default='./log', help="the directory to save tensorboard log.") - parser.add_argument('--sample_path', type=str, default='./sample', + parser.add_argument( + '--sample_path', + type=str, + default='./sample', help="the directory to save audio sample in synthesis.") - parser.add_argument('--transtts_path', type=str, default='./log', + parser.add_argument( + '--transtts_path', + type=str, + default='./log', help="the directory to load pretrain transformerTTS model.") - parser.add_argument('--transformer_step', type=int, default=160000, + parser.add_argument( + '--transformer_step', + type=int, + default=160000, help="the step to load transformerTTS model.") - - diff --git a/examples/fastspeech/synthesis.py b/examples/fastspeech/synthesis.py index 6a3d146..802d4e4 100644 --- a/examples/fastspeech/synthesis.py +++ b/examples/fastspeech/synthesis.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
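The `load_checkpoint` helper that follows (and that `train.py`, `train_transformer.py`, and `train_vocoder.py` repeat almost verbatim) exists because checkpoints saved from a `DataParallel`-wrapped model prefix every parameter name with `_layers.`, which must be stripped before `set_dict` on an unwrapped model. The renaming logic in isolation, run on a plain dict:

```python
# The prefix-stripping logic shared by the load_checkpoint helpers in this
# diff, demonstrated on a plain dict instead of a real dygraph checkpoint.
from collections import OrderedDict


def strip_data_parallel_prefix(model_dict):
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
            # DataParallel wraps the model, prefixing parameter names
            new_state_dict[param[8:]] = model_dict[param]
        else:
            new_state_dict[param] = model_dict[param]
    return new_state_dict


fake = OrderedDict([('_layers.encoder.weight', 1), ('decoder.bias', 2)])
assert strip_data_parallel_prefix(fake) == OrderedDict(
    [('encoder.weight', 1), ('decoder.bias', 2)])
```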
import os from tensorboardX import SummaryWriter from collections import OrderedDict @@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence from parakeet import audio from parakeet.models.fastspeech.fastspeech import FastSpeech + def load_checkpoint(step, model_path): model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() @@ -22,13 +36,14 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict + def synthesis(text_input, args): place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) # tensorboard if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'synthesis') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'synthesis') with open(args.config_path) as f: cfg = yaml.load(f, Loader=yaml.Loader) @@ -37,24 +52,28 @@ def synthesis(text_input, args): with dg.guard(place): model = FastSpeech(cfg) - model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))) + model.set_dict( + load_checkpoint( + str(args.fastspeech_step), + os.path.join(args.checkpoint_path, "fastspeech"))) model.eval() text = np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) - pos_text = np.arange(1, text.shape[1]+1) - pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) + text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + pos_text = np.arange(1, text.shape[1] + 1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) - mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha) + mel_output, mel_output_postnet = model( + text, pos_text, alpha=args.alpha) _ljspeech_processor = audio.AudioProcessor( - sample_rate=cfg['audio']['sr'], - num_mels=cfg['audio']['num_mels'], - min_level_db=cfg['audio']['min_level_db'], - ref_level_db=cfg['audio']['ref_level_db'], - n_fft=cfg['audio']['n_fft'], - win_length= cfg['audio']['win_length'], - hop_length= cfg['audio']['hop_length'], + sample_rate=cfg['audio']['sr'], + num_mels=cfg['audio']['num_mels'], + min_level_db=cfg['audio']['min_level_db'], + ref_level_db=cfg['audio']['ref_level_db'], + n_fft=cfg['audio']['n_fft'], + win_length=cfg['audio']['win_length'], + hop_length=cfg['audio']['hop_length'], power=cfg['audio']['power'], preemphasis=cfg['audio']['preemphasis'], signal_norm=True, @@ -67,14 +86,17 @@ def synthesis(text_input, args): do_trim_silence=False, sound_norm=False) - mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0]) - wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy()) + mel_output_postnet = fluid.layers.transpose( + fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0]) + wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy( + )) writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) print("Synthesis completed !!!") writer.close() + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train Fastspeech model") add_config_options_to_parser(parser) args = parser.parse_args() - synthesis("Transformer model is so fast!", args) \ No newline at end of file + synthesis("Transformer model is so fast!", args) diff --git a/examples/fastspeech/train.py b/examples/fastspeech/train.py index 52b5725..f1b59a2 100644 --- a/examples/fastspeech/train.py +++ b/examples/fastspeech/train.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import numpy as np import argparse import os @@ -20,8 +33,10 @@ import sys sys.path.append("../transformer_tts") from data import LJSpeechLoader + def load_checkpoint(step, model_path): - model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + model_dict, opti_dict = fluid.dygraph.load_dygraph( + os.path.join(model_path, step)) new_state_dict = OrderedDict() for param in model_dict: if param.startswith('_layers.'): @@ -30,6 +45,7 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict, opti_dict + def main(args): local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 @@ -43,26 +59,33 @@ def main(args): if args.use_gpu else fluid.CPUPlace()) if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'fastspeech') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'fastspeech') writer = SummaryWriter(path) if local_rank == 0 else None with dg.guard(place): with fluid.unique_name.guard(): transformerTTS = TransformerTTS(cfg) - model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer")) + model_dict, _ = load_checkpoint( + str(args.transformer_step), + os.path.join(args.transtts_path, "transformer")) transformerTTS.set_dict(model_dict) transformerTTS.eval() model = FastSpeech(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), - parameter_list=model.parameters()) - reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() - + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=dg.NoamDecay(1 / ( + cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']), + parameter_list=model.parameters()) + reader = LJSpeechLoader( + cfg, args, nranks, local_rank, shuffle=True).reader() + if args.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")) + model_dict, opti_dict = load_checkpoint( + str(args.fastspeech_step), + os.path.join(args.checkpoint_path, "fastspeech")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) global_step = args.fastspeech_step @@ -76,31 +99,42 @@ def main(args): pbar = tqdm(reader) for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d'%epoch) + pbar.set_description('Processing at epoch %d' % epoch) character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data - _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) - alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32) + _, _, attn_probs, _, _, _ = transformerTTS( + character, mel_input, pos_text, pos_mel) + alignment = dg.to_variable( + get_alignment(attn_probs, mel_lens, cfg[ + 
'transformer_head'])).astype(np.float32) global_step += 1 - + #Forward - result= model(character, - pos_text, - mel_pos=pos_mel, - length_target=alignment) + result = model( + character, + pos_text, + mel_pos=pos_mel, + length_target=alignment) mel_output, mel_output_postnet, duration_predictor_output, _, _ = result mel_loss = layers.mse_loss(mel_output, mel) mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) - duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment))) + duration_loss = layers.mean( + layers.abs( + layers.elementwise_sub(duration_predictor_output, + alignment))) total_loss = mel_loss + mel_postnet_loss + duration_loss - if local_rank==0: - writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) - writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) - writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) - writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) - + if local_rank == 0: + writer.add_scalar('mel_loss', + mel_loss.numpy(), global_step) + writer.add_scalar('post_mel_loss', + mel_postnet_loss.numpy(), global_step) + writer.add_scalar('duration_loss', + duration_loss.numpy(), global_step) + writer.add_scalar('learning_rate', + optimizer._learning_rate.step().numpy(), + global_step) if args.use_data_parallel: total_loss = model.scale_loss(total_loss) @@ -108,21 +142,25 @@ def main(args): model.apply_collective_grads() else: total_loss.backward() - optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) + optimizer.minimize( + total_loss, + grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[ + 'grad_clip_thresh'])) model.clear_gradients() - # save checkpoint - if local_rank==0 and global_step % args.save_step == 0: + # save checkpoint + if local_rank == 0 and global_step % args.save_step == 0: if not os.path.exists(args.save_path): os.mkdir(args.save_path) - save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step) + save_path = os.path.join(args.save_path, + 'fastspeech/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) - if local_rank==0: + if local_rank == 0: writer.close() -if __name__ =='__main__': +if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train Fastspeech model") add_config_options_to_parser(parser) args = parser.parse_args() diff --git a/examples/transformer_tts/README.md b/examples/transformer_tts/README.md index afdfdd2..6fda6d1 100644 --- a/examples/transformer_tts/README.md +++ b/examples/transformer_tts/README.md @@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr if you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step`` -For more help on arguments: +For more help on arguments: ``python train_transformer.py --help``. ## Train Vocoder @@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr ``` if you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step`` -For more help on arguments: +For more help on arguments: ``python train_vocoder.py --help``. ## Synthesis @@ -101,5 +101,5 @@ sh synthesis.sh And the audio file will be saved in ``--sample_path``. -For more help on arguments: +For more help on arguments: ``python synthesis.py --help``.
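Both `train.py` files in this diff build their optimizer with `dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step'])`. Assuming Paddle's `NoamDecay(d_model, warmup_steps)` follows the standard Noam schedule `d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)`, this particular `d_model` makes the schedule peak at exactly `args.lr` when `step == warm_up_step`. A quick check (the warm-up value is illustrative; the real one lives in the yaml config, which the diff does not show):

```python
# Why NoamDecay(1 / (warm_up_step * lr**2), warm_up_step) peaks at lr,
# assuming the standard Noam formula; 4000 is an illustrative warm-up.
lr, warm_up_step = 0.001, 4000  # --lr default from parse.py
d_model = 1 / (warm_up_step * lr ** 2)  # the value handed to NoamDecay


def noam(step):
    return d_model ** -0.5 * min(step ** -0.5, step * warm_up_step ** -1.5)


assert abs(noam(warm_up_step) - lr) < 1e-9  # peak learning rate == args.lr
print(noam(1), noam(warm_up_step), noam(10 * warm_up_step))
```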
diff --git a/examples/transformer_tts/data.py b/examples/transformer_tts/data.py index 9401b7b..99c6739 100644 --- a/examples/transformer_tts/data.py +++ b/examples/transformer_tts/data.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pathlib import Path import numpy as np import pandas as pd @@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo from parakeet.data.batch import TextIDBatcher, SpecBatcher from parakeet.data.dataset import DatasetMixin, TransformDataset + class LJSpeechLoader: - def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True): + def __init__(self, + config, + args, + nranks, + rank, + is_vocoder=False, + shuffle=True): place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace() LJSPEECH_ROOT = Path(args.data_path) metadata = LJSpeechMetaData(LJSPEECH_ROOT) transformer = LJSpeech(config) dataset = TransformDataset(metadata, transformer) - sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle) + sampler = DistributedSampler( + len(metadata), nranks, rank, shuffle=shuffle) assert args.batch_size % nranks == 0 each_bs = args.batch_size // nranks if is_vocoder: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True) + dataloader = DataCargo( + dataset, + sampler=sampler, + batch_size=each_bs, + shuffle=shuffle, + batch_fn=batch_examples_vocoder, + drop_last=True) else: - dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True) - + dataloader = DataCargo( + dataset, + sampler=sampler, + batch_size=each_bs, + shuffle=shuffle, + batch_fn=batch_examples, + drop_last=True) + self.reader = fluid.io.DataLoader.from_generator( capacity=32, iterable=True, @@ -63,13 +96,13 @@ class LJSpeech(object): super(LJSpeech, self).__init__() self.config = config self._ljspeech_processor = audio.AudioProcessor( - sample_rate=config['audio']['sr'], - num_mels=config['audio']['num_mels'], - min_level_db=config['audio']['min_level_db'], - ref_level_db=config['audio']['ref_level_db'], - n_fft=config['audio']['n_fft'], - win_length= config['audio']['win_length'], - hop_length= config['audio']['hop_length'], + sample_rate=config['audio']['sr'], + num_mels=config['audio']['num_mels'], + min_level_db=config['audio']['min_level_db'], + ref_level_db=config['audio']['ref_level_db'], + n_fft=config['audio']['n_fft'], + win_length=config['audio']['win_length'], + hop_length=config['audio']['hop_length'], power=config['audio']['power'], preemphasis=config['audio']['preemphasis'], signal_norm=True, @@ -81,7 +114,7 @@ class LJSpeech(object): griffin_lim_iters=60, do_trim_silence=False, sound_norm=False) - + def __call__(self, metadatum): """All the code for generating an Example from a metadatum. If you want a different preprocessing pipeline, you can override this method. 
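`batch_examples` in the hunks below sorts seven parallel lists by descending text length, repeating the idiom `[i for i, _ in sorted(zip(xs, text_lens), key=lambda x: x[1], reverse=True)]` once per list so they all end up in the same order. The idiom on toy data:

```python
# The reordering idiom repeated in batch_examples below: sort any list by
# descending text length while keeping it aligned with its siblings.
texts = ['ab', 'abcd', 'a']
text_lens = [2, 4, 1]
mels = ['mel0', 'mel1', 'mel2']


def by_text_len(xs):
    return [
        i for i, _ in sorted(
            zip(xs, text_lens), key=lambda x: x[1], reverse=True)
    ]


assert by_text_len(texts) == ['abcd', 'ab', 'a']
assert by_text_len(mels) == ['mel1', 'mel0', 'mel2']  # same permutation
```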
@@ -90,13 +123,15 @@ class LJSpeech(object): method. """ fname, raw_text, normalized_text = metadatum - + # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize wav = self._ljspeech_processor.load_wav(str(fname)) mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32) mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32) - phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) - return (mag, mel, phonemes) # maybe we need to implement it as a map in the future + phonemes = np.array( + g2p.en.text_to_sequence(normalized_text), dtype=np.int64) + return (mag, mel, phonemes + ) # maybe we need to implement it as a map in the future def batch_examples(batch): @@ -109,44 +144,71 @@ def batch_examples(batch): pos_mels = [] for data in batch: _, mel, text = data - mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) + mel_inputs.append( + np.concatenate( + [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]], + axis=-1)) mel_lens.append(mel.shape[1]) text_lens.append(len(text)) pos_texts.append(np.arange(1, len(text) + 1)) pos_mels.append(np.arange(1, mel.shape[1] + 1)) mels.append(mel) texts.append(text) - + # Sort by text_len in descending order - texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] - mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] - mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] - mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)] - pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] - pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] + texts = [ + i + for i, _ in sorted( + zip(texts, text_lens), key=lambda x: x[1], reverse=True) + ] + mels = [ + i + for i, _ in sorted( + zip(mels, text_lens), key=lambda x: x[1], reverse=True) + ] + mel_inputs = [ + i + for i, _ in sorted( + zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True) + ] + mel_lens = [ + i + for i, _ in sorted( + zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) + ] + pos_texts = [ + i + for i, _ in sorted( + zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True) + ] + pos_mels = [ + i + for i, _ in sorted( + zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True) + ] text_lens = sorted(text_lens, reverse=True) # Pad sequence with largest len of the batch - texts = TextIDBatcher(pad_id=0)(texts) #(B, T) - pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) - pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels) - mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels) - return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens)) + texts = TextIDBatcher(pad_id=0)(texts) #(B, T) + pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) + pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) + mels = np.transpose( + SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels) + mel_inputs = np.transpose( + SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), + np.array(mel_lens)) + def batch_examples_vocoder(batch): - mels=[] - mags=[] + mels = [] + mags = [] for data in 
batch: mag, mel, _ = data mels.append(mel) mags.append(mag) - mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) - mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1)) + mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) + mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1)) return (mels, mags) - - - - diff --git a/examples/transformer_tts/parse.py b/examples/transformer_tts/parse.py index aebce96..e7f124a 100644 --- a/examples/transformer_tts/parse.py +++ b/examples/transformer_tts/parse.py @@ -1,38 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + def add_config_options_to_parser(parser): - parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml', + parser.add_argument( + '--config_path', + type=str, + default='config/train_transformer.yaml', help="the yaml config file path.") - parser.add_argument('--batch_size', type=int, default=32, - help="batch size for training.") - parser.add_argument('--epochs', type=int, default=10000, + parser.add_argument( + '--batch_size', type=int, default=32, help="batch size for training.") + parser.add_argument( + '--epochs', + type=int, + default=10000, help="the number of epoch for training.") - parser.add_argument('--lr', type=float, default=0.001, + parser.add_argument( + '--lr', + type=float, + default=0.001, help="the learning rate for training.") - parser.add_argument('--save_step', type=int, default=500, + parser.add_argument( + '--save_step', + type=int, + default=500, help="checkpointing interval during training.") - parser.add_argument('--image_step', type=int, default=2000, + parser.add_argument( + '--image_step', + type=int, + default=2000, help="attention image interval during training.") - parser.add_argument('--max_len', type=int, default=400, + parser.add_argument( + '--max_len', + type=int, + default=400, help="The max length of audio during synthesis.") - parser.add_argument('--transformer_step', type=int, default=160000, + parser.add_argument( + '--transformer_step', + type=int, + default=160000, help="Global step to restore checkpoint of transformer.") - parser.add_argument('--vocoder_step', type=int, default=90000, + parser.add_argument( + '--vocoder_step', + type=int, + default=90000, help="Global step to restore checkpoint of postnet.") - parser.add_argument('--use_gpu', type=int, default=1, + parser.add_argument( + '--use_gpu', + type=int, + default=1, help="use gpu or not during training.") - parser.add_argument('--use_data_parallel', type=int, default=0, + parser.add_argument( + '--use_data_parallel', + type=int, + default=0, help="use data parallel or not during training.") - parser.add_argument('--stop_token', type=int, default=0, + parser.add_argument( + '--stop_token', + type=int, + default=0, help="use stop token loss in network or not.") - parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', + parser.add_argument(
'--data_path', + type=str, + default='./dataset/LJSpeech-1.1', help="the path of dataset.") - parser.add_argument('--checkpoint_path', type=str, default=None, + parser.add_argument( + '--checkpoint_path', + type=str, + default=None, help="the path to load checkpoint or pretrain model.") - parser.add_argument('--save_path', type=str, default='./checkpoint', + parser.add_argument( + '--save_path', + type=str, + default='./checkpoint', help="the path to save checkpoint.") - parser.add_argument('--log_dir', type=str, default='./log', + parser.add_argument( + '--log_dir', + type=str, + default='./log', help="the directory to save tensorboard log.") - parser.add_argument('--sample_path', type=str, default='./sample', + parser.add_argument( + '--sample_path', + type=str, + default='./sample', help="the directory to save audio sample in synthesis.") diff --git a/examples/transformer_tts/synthesis.py b/examples/transformer_tts/synthesis.py index fb1bd2f..de83362 100644 --- a/examples/transformer_tts/synthesis.py +++ b/examples/transformer_tts/synthesis.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from scipy.io.wavfile import write from parakeet.g2p.en import text_to_sequence @@ -16,6 +29,7 @@ from parakeet import audio from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.transformer_tts import TransformerTTS + def load_checkpoint(step, model_path): model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() @@ -26,6 +40,7 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict + def synthesis(text_input, args): place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) @@ -34,46 +49,53 @@ def synthesis(text_input, args): # tensorboard if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'synthesis') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'synthesis') writer = SummaryWriter(path) with dg.guard(place): with fluid.unique_name.guard(): model = TransformerTTS(cfg) - model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))) + model.set_dict( + load_checkpoint( + str(args.transformer_step), + os.path.join(args.checkpoint_path, "transformer"))) model.eval() - + with fluid.unique_name.guard(): model_vocoder = Vocoder(cfg, args.batch_size) - model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))) + model_vocoder.set_dict( + load_checkpoint( + str(args.vocoder_step), + os.path.join(args.checkpoint_path, "vocoder"))) model_vocoder.eval() # init input text = np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) - mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) - pos_text = np.arange(1, text.shape[1]+1) - pos_text = 
fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) - + text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32) + pos_text = np.arange(1, text.shape[1] + 1) + pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) pbar = tqdm(range(args.max_len)) for i in pbar: - pos_mel = np.arange(1, mel_input.shape[1]+1) - pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) - mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) + pos_mel = np.arange(1, mel_input.shape[1] + 1) + pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + text, mel_input, pos_text, pos_mel) + mel_input = fluid.layers.concat( + [mel_input, postnet_pred[:, -1:, :]], axis=1) mag_pred = model_vocoder(postnet_pred) _ljspeech_processor = audio.AudioProcessor( - sample_rate=cfg['audio']['sr'], - num_mels=cfg['audio']['num_mels'], - min_level_db=cfg['audio']['min_level_db'], - ref_level_db=cfg['audio']['ref_level_db'], - n_fft=cfg['audio']['n_fft'], - win_length= cfg['audio']['win_length'], - hop_length= cfg['audio']['hop_length'], + sample_rate=cfg['audio']['sr'], + num_mels=cfg['audio']['num_mels'], + min_level_db=cfg['audio']['min_level_db'], + ref_level_db=cfg['audio']['ref_level_db'], + n_fft=cfg['audio']['n_fft'], + win_length=cfg['audio']['win_length'], + hop_length=cfg['audio']['hop_length'], power=cfg['audio']['power'], preemphasis=cfg['audio']['preemphasis'], signal_norm=True, @@ -86,13 +108,18 @@ def synthesis(text_input, args): do_trim_silence=False, sound_norm=False) - wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) + wav = _ljspeech_processor.inv_spectrogram( + fluid.layers.transpose( + fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy()) writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) if not os.path.exists(args.sample_path): os.mkdir(args.sample_path) - write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav) + write( + os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'], + wav) writer.close() + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Synthesis model") add_config_options_to_parser(parser) diff --git a/examples/transformer_tts/train_transformer.py b/examples/transformer_tts/train_transformer.py index cbca569..f3dd023 100644 --- a/examples/transformer_tts/train_transformer.py +++ b/examples/transformer_tts/train_transformer.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
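The loop above decodes autoregressively: `mel_input` starts as a single all-zero frame of shape `[1, 1, 80]`, and every iteration appends only the newest postnet frame (`postnet_pred[:, -1:, :]`), so the mel positions run 1, 2, … up to `--max_len`. The shape bookkeeping with a stub in place of the real model:

```python
# Shape bookkeeping of the synthesis loop above, with a numpy stub standing
# in for the TransformerTTS forward pass.
import numpy as np

num_mels, max_len = 80, 5
mel_input = np.zeros([1, 1, num_mels], np.float32)  # the all-zero "go" frame


def stub_model(mel_input, pos_mel):
    # pretend decoder: one 80-dim output frame per input frame
    return np.random.rand(1, mel_input.shape[1], num_mels).astype(np.float32)


for _ in range(max_len):
    pos_mel = np.arange(1, mel_input.shape[1] + 1)[None, :]  # 1-based positions
    postnet_pred = stub_model(mel_input, pos_mel)
    # grow the input by exactly one frame, as postnet_pred[:, -1:, :] does
    mel_input = np.concatenate([mel_input, postnet_pred[:, -1:, :]], axis=1)

assert mel_input.shape == (1, 1 + max_len, num_mels)
```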
import os from tqdm import tqdm from tensorboardX import SummaryWriter @@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy from data import LJSpeechLoader from parakeet.models.transformer_tts.transformer_tts import TransformerTTS + def load_checkpoint(step, model_path): - model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) + model_dict, opti_dict = fluid.dygraph.load_dygraph( + os.path.join(model_path, step)) new_state_dict = OrderedDict() for param in model_dict: if param.startswith('_layers.'): @@ -40,22 +55,27 @@ def main(args): if args.use_gpu else fluid.CPUPlace()) if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'transformer') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'transformer') writer = SummaryWriter(path) if local_rank == 0 else None - + with dg.guard(place): model = TransformerTTS(cfg) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), - parameter_list=model.parameters()) - - reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=dg.NoamDecay(1 / ( + cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']), + parameter_list=model.parameters()) + + reader = LJSpeechLoader( + cfg, args, nranks, local_rank, shuffle=True).reader() if args.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")) + model_dict, opti_dict = load_checkpoint( + str(args.transformer_step), + os.path.join(args.checkpoint_path, "transformer")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) global_step = args.transformer_step @@ -64,86 +84,112 @@ def main(args): if args.use_data_parallel: strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - + for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d'%epoch) + pbar.set_description('Processing at epoch %d' % epoch) character, mel, mel_input, pos_text, pos_mel, text_length, _ = data global_step += 1 - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) - + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + character, mel_input, pos_text, pos_mel) label = (pos_mel == 0).astype(np.float32) - - mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) - post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) + + mel_loss = layers.mean( + layers.abs(layers.elementwise_sub(mel_pred, mel))) + post_mel_loss = layers.mean( + layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss # Note: When used stop token loss the learning did not work. 
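The stop-token target above is derived from the position indices: `pos_mel` numbers real frames from 1 and leaves padding at 0, so `(pos_mel == 0)` marks exactly the padded tail of each utterance (the in-code note records that training did not converge with this loss enabled, which is why `--stop_token` defaults to 0). In numpy terms:

```python
# How the stop-token label in train_transformer.py is built from pos_mel.
import numpy as np

pos_mel = np.array([[1, 2, 3, 0, 0]])      # 3 real frames, 2 padded
label = (pos_mel == 0).astype(np.float32)  # 1.0 exactly on the padding
assert label.tolist() == [[0.0, 0.0, 0.0, 1.0, 1.0]]
```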
if args.stop_token: stop_loss = cross_entropy(stop_preds, label) loss = loss + stop_loss - if local_rank==0: + if local_rank == 0: writer.add_scalars('training_loss', { - 'mel_loss':mel_loss.numpy(), - 'post_mel_loss':post_mel_loss.numpy() + 'mel_loss': mel_loss.numpy(), + 'post_mel_loss': post_mel_loss.numpy() }, global_step) if args.stop_token: - writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) + writer.add_scalar('stop_loss', + stop_loss.numpy(), global_step) if args.use_data_parallel: writer.add_scalars('alphas', { - 'encoder_alpha':model._layers.encoder.alpha.numpy(), - 'decoder_alpha':model._layers.decoder.alpha.numpy(), + 'encoder_alpha': + model._layers.encoder.alpha.numpy(), + 'decoder_alpha': + model._layers.decoder.alpha.numpy(), }, global_step) else: writer.add_scalars('alphas', { - 'encoder_alpha':model.encoder.alpha.numpy(), - 'decoder_alpha':model.decoder.alpha.numpy(), + 'encoder_alpha': model.encoder.alpha.numpy(), + 'decoder_alpha': model.decoder.alpha.numpy(), }, global_step) - writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) + writer.add_scalar('learning_rate', + optimizer._learning_rate.step().numpy(), + global_step) if global_step % args.image_step == 1: for i, prob in enumerate(attn_probs): for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC") + x = np.uint8( + cm.viridis(prob.numpy()[j * 16]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") for i, prob in enumerate(attn_enc): for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") + x = np.uint8( + cm.viridis(prob.numpy()[j * 16]) * 255) + writer.add_image( + 'Attention_enc_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") for i, prob in enumerate(attn_dec): for j in range(4): - x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) - writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") - + x = np.uint8( + cm.viridis(prob.numpy()[j * 16]) * 255) + writer.add_image( + 'Attention_dec_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + if args.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) + optimizer.minimize( + loss, + grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[ + 'grad_clip_thresh'])) model.clear_gradients() - + # save checkpoint - if local_rank==0 and global_step % args.save_step == 0: + if local_rank == 0 and global_step % args.save_step == 0: if not os.path.exists(args.save_path): os.mkdir(args.save_path) - save_path = os.path.join(args.save_path,'transformer/%d' % global_step) + save_path = os.path.join(args.save_path, + 'transformer/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) - if local_rank==0: + if local_rank == 0: writer.close() - -if __name__ =='__main__': + +if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train TransformerTTS model") add_config_options_to_parser(parser) diff --git a/examples/transformer_tts/train_vocoder.py b/examples/transformer_tts/train_vocoder.py index 857fdf0..7896223 100644 --- a/examples/transformer_tts/train_vocoder.py +++ 
b/examples/transformer_tts/train_vocoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from tensorboardX import SummaryWriter import os from tqdm import tqdm @@ -13,6 +26,7 @@ import paddle.fluid.layers as layers from data import LJSpeechLoader from parakeet.models.transformer_tts.vocoder import Vocoder + def load_checkpoint(step, model_path): model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step)) new_state_dict = OrderedDict() @@ -23,8 +37,9 @@ def load_checkpoint(step, model_path): new_state_dict[param] = model_dict[param] return new_state_dict, opti_dict + def main(args): - + local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 @@ -35,23 +50,26 @@ def main(args): place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) if args.use_data_parallel else fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) - + if not os.path.exists(args.log_dir): - os.mkdir(args.log_dir) - path = os.path.join(args.log_dir,'vocoder') + os.mkdir(args.log_dir) + path = os.path.join(args.log_dir, 'vocoder') writer = SummaryWriter(path) if local_rank == 0 else None - with dg.guard(place): + with dg.guard(place): model = Vocoder(cfg, args.batch_size) model.train() - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), - parameter_list=model.parameters()) - + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=dg.NoamDecay(1 / ( + cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']), + parameter_list=model.parameters()) if args.checkpoint_path is not None: - model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")) + model_dict, opti_dict = load_checkpoint( + str(args.vocoder_step), + os.path.join(args.checkpoint_path, "vocoder")) model.set_dict(model_dict) optimizer.set_dict(opti_dict) global_step = args.vocoder_step @@ -61,48 +79,55 @@ def main(args): strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader() + reader = LJSpeechLoader( + cfg, args, nranks, local_rank, is_vocoder=True).reader() for epoch in range(args.epochs): pbar = tqdm(reader) for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d'%epoch) + pbar.set_description('Processing at epoch %d' % epoch) mel, mag = data mag = dg.to_variable(mag.numpy()) mel = dg.to_variable(mel.numpy()) global_step += 1 mag_pred = model(mel) - loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) - + loss = layers.mean( + layers.abs(layers.elementwise_sub(mag_pred, mag))) + if args.use_data_parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() - optimizer.minimize(loss, grad_clip = 
fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) + optimizer.minimize( + loss, + grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[ + 'grad_clip_thresh'])) model.clear_gradients() - - if local_rank==0: - writer.add_scalars('training_loss',{ - 'loss':loss.numpy(), + + if local_rank == 0: + writer.add_scalars('training_loss', { + 'loss': loss.numpy(), }, global_step) if global_step % args.save_step == 0: if not os.path.exists(args.save_path): os.mkdir(args.save_path) - save_path = os.path.join(args.save_path,'vocoder/%d' % global_step) + save_path = os.path.join(args.save_path, + 'vocoder/%d' % global_step) dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path) - if local_rank==0: + if local_rank == 0: writer.close() + if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train vocoder model") add_config_options_to_parser(parser) args = parser.parse_args() # Print the whole config setting. pprint(args) - main(args) \ No newline at end of file + main(args) diff --git a/examples/waveflow/benchmark.py b/examples/waveflow/benchmark.py index 24d83c4..3badeda 100644 --- a/examples/waveflow/benchmark.py +++ b/examples/waveflow/benchmark.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from pprint import pprint diff --git a/examples/waveflow/synthesis.py b/examples/waveflow/synthesis.py index 76df229..0647e94 100644 --- a/examples/waveflow/synthesis.py +++ b/examples/waveflow/synthesis.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from pprint import pprint diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py index 92bb9ef..32059c8 100644 --- a/examples/waveflow/train.py +++ b/examples/waveflow/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import os import random import subprocess diff --git a/examples/waveflow/utils.py b/examples/waveflow/utils.py index 51f6296..da9b4ba 100644 --- a/examples/waveflow/utils.py +++ b/examples/waveflow/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os import time diff --git a/parakeet/__init__.py b/parakeet/__init__.py index 9dbb99b..9be1aaf 100644 --- a/parakeet/__init__.py +++ b/parakeet/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + __version__ = "0.0.0" from . import data, g2p, models, modules diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py index 6212dee..253a887 100644 --- a/parakeet/audio/__init__.py +++ b/parakeet/audio/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .audio import AudioProcessor \ No newline at end of file diff --git a/parakeet/audio/audio.py b/parakeet/audio/audio.py index b861a39..9133a47 100644 --- a/parakeet/audio/audio.py +++ b/parakeet/audio/audio.py @@ -1,30 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
+
class AudioProcessor(object):
- def __init__(self,
- sample_rate=None, # int, sampling rate
- num_mels=None, # int, bands of mel spectrogram
- min_level_db=None, # float, minimum level db
- ref_level_db=None, # float, reference level db
- n_fft=None, # int: number of samples in a frame for stft
- win_length=None, # int: the same meaning with n_fft
- hop_length=None, # int: number of samples between neighboring frame
- power=None, # float:power to raise before griffin-lim
- preemphasis=None, # float: preemphasis coefficident
- signal_norm=None, #
- symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
- max_norm=None, # float, max norm
- mel_fmin=None, # int: mel spectrogram's minimum frequency
- mel_fmax=None, # int: mel spectrogram's maximum frequency
- clip_norm=True, # bool: clip spectrogram's norm
- griffin_lim_iters=None, # int:
- do_trim_silence=False, # bool: trim silence
- sound_norm=False,
- **kwargs):
+ def __init__(
+ self,
+ sample_rate=None, # int, sampling rate
+ num_mels=None, # int, bands of mel spectrogram
+ min_level_db=None, # float, minimum level db
+ ref_level_db=None, # float, reference level db
+ n_fft=None, # int: number of samples in a frame for stft
+ win_length=None, # int: the same meaning as n_fft
+ hop_length=None, # int: number of samples between neighboring frames
+ power=None, # float: power to raise before griffin-lim
+ preemphasis=None, # float: preemphasis coefficient
+ signal_norm=None, #
+ symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_norm]
+ max_norm=None, # float, max norm
+ mel_fmin=None, # int: mel spectrogram's minimum frequency
+ mel_fmax=None, # int: mel spectrogram's maximum frequency
+ clip_norm=True, # bool: clip spectrogram's norm
+ griffin_lim_iters=None, # int:
+ do_trim_silence=False, # bool: trim silence
+ sound_norm=False,
+ **kwargs):
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
@@ -34,8 +50,8 @@ class AudioProcessor(object):
self.n_fft = n_fft
self.win_length = win_length or n_fft
# hop length defaults to 1/4 window_length
- self.hop_length = hop_length or 0.25 * self.win_length
-
+ self.hop_length = hop_length or 0.25 * self.win_length
+
self.power = power
self.preemphasis = float(preemphasis)
@@ -52,7 +68,8 @@ class AudioProcessor(object):
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
- self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
+ self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
+ )
def _stft_parameters(self):
"""compute frame length and hop length in ms"""
@@ -65,44 +82,54 @@ class AudioProcessor(object):
"""object repr"""
cls_name_str = self.__class__.__name__
members = vars(self)
- dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()])
+ dict_str = "\n".join(
+ [" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str
def save_wav(self, path, wav):
"""save audio with scipy.io.wavfile in 16bit integers"""
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
- scipy.io.wavfile.write(path, self.sample_rate, wav_norm.as_type(np.int16))
+ scipy.io.wavfile.write(path, self.sample_rate,
+ wav_norm.astype(np.int16))
def load_wav(self, path, sr=None):
"""load wav -> trim_silence -> rescale"""
x, sr = librosa.load(path, sr=None)
- assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate)
+ assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
+ sr, self.sample_rate)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
- print(" [!] File cannot be trimmed for silence - {}".format(path))
+ print(" [!] File cannot be trimmed for silence - {}".format(
+ path))
if self.sound_norm:
- x = x / x.max() * 0.9 # why 0.9 ?
+ x = x / x.max() * 0.9  # why 0.9 ?
return x
def trim_silence(self, wav):
"""Trim silent parts with a threshold and 0.01s margin"""
margin = int(self.sample_rate * 0.01)
- wav = wav[margin: -margin]
- trimed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
+ wav = wav[margin:-margin]
+ trimed_wav = librosa.effects.trim(
+ wav,
+ top_db=60,
+ frame_length=self.win_length,
+ hop_length=self.hop_length)[0]
return trimed_wav
def apply_preemphasis(self, x):
if self.preemphasis == 0.:
- raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+ raise RuntimeError(
+ " !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0.:
- raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+ raise RuntimeError(
+ " !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
def _amplitude_to_db(self, x):
@@ -125,12 +152,11 @@ class AudioProcessor(object):
"""return mel basis for mel scale"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
- return librosa.filters.mel(
- self.sample_rate,
- self.n_fft,
- n_mels=self.num_mels,
- fmin=self.mel_fmin,
- fmax=self.mel_fmax)
+ return librosa.filters.mel(self.sample_rate,
+ self.n_fft,
+ n_mels=self.num_mels,
+ fmin=self.mel_fmin,
+ fmax=self.mel_fmax)
def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
@@ -156,25 +182,29 @@ class AudioProcessor(object):
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
- S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
+ S_denorm = (S_denorm + self.max_norm) * (
+ -self.min_level_db) / (2 * self.max_norm
+ ) + self.min_level_db
return S_denorm
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
- S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
+ S_denorm = S_denorm * (-self.min_level_db
+ ) / self.max_norm + self.min_level_db
return S_denorm
else:
return S
def _stft(self, y):
return librosa.stft(
- y=y,
+ y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
def _istft(self, S):
- return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
+ return librosa.istft(
+ S, hop_length=self.hop_length, win_length=self.win_length)
def spectrogram(self, y):
"""compute linear spectrogram(amplitude)
@@ -195,7 +225,8 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
- S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
+ S = self._amplitude_to_db(self._linear_to_mel(np.abs(
+ D))) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S +
self.ref_level_db) if self.preemphasis: - return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - return self._griffin_lim(S ** self.power) + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) def inv_melspectrogram(self, mel_spectrogram): S = self._denormalize(mel_spectrogram) S = self._db_to_amplitude(S + self.ref_level_db) S = self._mel_to_linear(np.abs(S)) if self.preemphasis: - return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - return self._griffin_lim(S ** self.power) + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) def out_linear_to_mel(self, linear_spec): """convert output linear spec to mel spec""" @@ -222,7 +253,7 @@ class AudioProcessor(object): S = self._amplitude_to_db(S) - self.ref_level_db mel = self._normalize(S) return mel - + def _griffin_lim(self, S): angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) S_complex = np.abs(S).astype(np.complex) @@ -234,18 +265,18 @@ class AudioProcessor(object): @staticmethod def mulaw_encode(wav, qc): - mu = 2 ** qc - 1 + mu = 2**qc - 1 # wav_abs = np.minimum(np.abs(wav), 1.0) signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu) # Quantize signal to the specified number of levels. signal = (signal + 1) / 2 * mu + 0.5 - return np.floor(signal,) + return np.floor(signal, ) @staticmethod def mulaw_decode(wav, qc): """Recovers waveform from quantized values.""" - mu = 2 ** qc - 1 - x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + mu = 2**qc - 1 + x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1) return x @staticmethod diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py index ed86edd..be28f11 100644 --- a/parakeet/data/__init__.py +++ b/parakeet/data/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .dataset import * from .datacargo import * from .sampler import * diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 8777472..22c24e4 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -1,18 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ functions to make batch for arrays which satisfy some conditions. 
""" import numpy as np + class TextIDBatcher(object): """A wrapper class for a function to build a functor, which holds the configs to pass to the function.""" + def __init__(self, pad_id=0, dtype=np.int64): self.pad_id = pad_id self.dtype = dtype - + def __call__(self, minibatch): out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype) return out + def batch_text_id(minibatch, pad_id=0, dtype=np.int64): """ minibatch: List[Example] @@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): """ peek_example = minibatch[0] assert len(peek_example.shape) == 1, "text example is an 1D tensor" - - lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) + + lengths = [example.shape[0] for example in minibatch + ] # assume (channel, n_samples) or (n_samples, ) max_len = np.max(lengths) - + batch = [] for example in minibatch: pad_len = max_len - example.shape[0] - batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id)) + batch.append( + np.pad(example, [(0, pad_len)], + mode='constant', + constant_values=pad_id)) return np.array(batch, dtype=dtype) + class WavBatcher(object): def __init__(self, pad_value=0., dtype=np.float32): self.pad_value = pad_value self.dtype = dtype - + def __call__(self, minibatch): out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype) return out + def batch_wav(minibatch, pad_value=0., dtype=np.float32): """ minibatch: List[Example] @@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): mono_channel = True elif len(peek_example.shape) == 2: mono_channel = False - - lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) + + lengths = [example.shape[-1] for example in minibatch + ] # assume (channel, n_samples) or (n_samples, ) max_len = np.max(lengths) - + batch = [] for example in minibatch: pad_len = max_len - example.shape[-1] if mono_channel: - batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value)) + batch.append( + np.pad(example, [(0, pad_len)], + mode='constant', + constant_values=pad_value)) else: - batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no - + batch.append( + np.pad(example, [(0, 0), (0, pad_len)], + mode='constant', + constant_values=pad_value)) # what about PCM, no + return np.array(batch, dtype=dtype) @@ -75,6 +104,7 @@ class SpecBatcher(object): out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype) return out + def batch_spec(minibatch, pad_value=0., dtype=np.float32): """ minibatch: List[Example] @@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): mono_channel = True elif len(peek_example.shape) == 3: mono_channel = False - - lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame) - max_len = np.max(lengths) - + + lengths = [example.shape[-1] for example in minibatch + ] # assume (channel, F, n_frame) or (F, n_frame) + max_len = np.max(lengths) + batch = [] for example in minibatch: pad_len = max_len - example.shape[-1] if mono_channel: - batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) + batch.append( + np.pad(example, [(0, 0), (0, pad_len)], + mode='constant', + constant_values=pad_value)) else: - batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, 
-
- return np.array(batch, dtype=dtype)
\ No newline at end of file
+ batch.append(
+ np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
+ mode='constant',
+ constant_values=pad_value)) # what about PCM, no
+
+ return np.array(batch, dtype=dtype)
diff --git a/parakeet/data/datacargo.py b/parakeet/data/datacargo.py
index 8c9a3b2..904cd3c 100644
--- a/parakeet/data/datacargo.py
+++ b/parakeet/data/datacargo.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler
diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py
index d9f9a1f..d577f9e 100644
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import six
import numpy as np
@@ -9,8 +23,7 @@ class DatasetMixin(object):
if isinstance(index, slice):
start, stop, step = index.indices(len(self))
return [
- self.get_example(i)
- for i in six.moves.range(start, stop, step)
+ self.get_example(i) for i in six.moves.range(start, stop, step)
]
elif isinstance(index, (list, np.ndarray)):
return [self.get_example(i) for i in index]
@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
def get_example(self, i):
if i < 0:
- raise IndexError(
- "ChainDataset doesnot support negative indexing.")
+ raise IndexError("ChainDataset does not support negative indexing.")
for dataset in self._datasets:
if i < len(dataset):
diff --git a/parakeet/data/sampler.py b/parakeet/data/sampler.py
index 60aa5db..b4ef097 100644
--- a/parakeet/data/sampler.py
+++ b/parakeet/data/sampler.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
In most cases, we have a non-stream dataset, which means we can randomly access it with __getitem__, and we can get the length of the dataset with __len__.
@@ -6,10 +19,10 @@
This suffices for a sampler. We implement the sampler as an iterable of valid indices. So the sampler is only responsible for generating valid indices.
"""
-
import numpy as np
import random
+
class Sampler(object):
def __init__(self, data_source):
pass
@@ -23,7 +36,7 @@ class Sampler(object):
class SequentialSampler(Sampler):
def __init__(self, data_source):
self.data_source = data_source
-
+
def __iter__(self):
return iter(range(len(self.data_source)))
@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement:
- raise ValueError("With replacement=False, num_samples should not be specified, "
- "since a random permutation will be performed.")
+ raise ValueError(
+ "With replacement=False, num_samples should not be specified, "
+ "since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
- "value, but got num_samples={}".format(self.num_samples))
+ "value, but got num_samples={}".format(
+ self.num_samples))
@property
def num_samples(self):
@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self):
n = len(self.data_source)
if self.replacement:
- return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist())
+ return iter(
+ np.random.randint(
+ 0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist())
def __len__(self):
@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices
def __iter__(self):
- return (self.indices[i] for i in np.random.permutation(len(self.indices)))
+ return (self.indices[i]
+ for i in np.random.permutation(len(self.indices)))
def __len__(self):
return len(self.indices)
@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permute mini-batches
"""
- def __init__(self, lengths, batch_size=4, batch_group_size=None,
+ def __init__(self,
+ lengths,
+ batch_size=4,
+ batch_group_size=None,
permutate=True):
- _lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key
+ _lengths = np.array(
+ lengths,
+ dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths)
@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size):
s = i * batch_group_size
e = s + batch_group_size
- random.shuffle(indices[s: e]) # inplace
+ random.shuffle(indices[s:e]) # inplace
# Permutate batches
if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm)
- indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1)
+ indices[:e] = indices[:e].reshape(
+ -1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements
s += batch_group_size
#print(indices)
if s < len(indices):
random.shuffle(indices[s:])
-
+
return iter(indices)
def __len__(self):
@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
- "value, but got num_samples={}".format(num_samples))
+ "value, but got num_samples={}".format(
+ num_samples))
self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples
self.replacement = replacement
def __iter__(self):
- return iter(np.random.choice(len(self.weights), size=(self.num_samples, ),
- replace=self.replacement, p=self.weights).tolist())
+ return iter(
+ np.random.choice(
+ len(self.weights),
+ size=(self.num_samples, ),
+ replace=self.replacement,
+ p=self.weights).tolist())
def __len__(self):
return self.num_samples
@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers]
- assert len(indices) == self.num_samples
+ assert len(indices) == self.num_samples
return iter(indices)
@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of "
- "Sampler, but got sampler={}"
- .format(sampler))
+ "Sampler, but got sampler={}".format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))
diff --git a/parakeet/datasets/README.md b/parakeet/datasets/README.md
index 96509ca..cd4f8f4 100644
--- a/parakeet/datasets/README.md
+++ b/parakeet/datasets/README.md
@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand)
For deep learning practice, we typically batch examples. So the dataset should come with a method to batch examples. Assuming the record is implemented as a tuple with several items, when an item is represented as a fixed-size array, batching it is trivial: `np.stack` suffices. But for arrays with dynamic size, padding is needed, as sketched below. We decide to implement a batching method for each item. Then batching a record can be implemented by these methods. For a dataset, a `_batch_examples` method should be implemented. But in most cases, you can choose one from `batching.py`.
-That is it!
-
-
-
-
-
+That is it!
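The padding-based batching described in the README above is what the batchers in `parakeet/data/batch.py` implement. A minimal numpy sketch of the idea, mirroring `batch_text_id` (the example arrays below are illustrative, not taken from the library):

```python
import numpy as np

def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
    # Pad every 1D id sequence on the right up to the longest
    # length in the minibatch, then stack into one 2D array.
    max_len = max(example.shape[0] for example in minibatch)
    batch = [
        np.pad(example, (0, max_len - example.shape[0]),
               mode="constant", constant_values=pad_id)
        for example in minibatch
    ]
    return np.array(batch, dtype=dtype)

# Three "sentences" of different lengths batch into a (3, 5) array.
texts = [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9, 10])]
assert batch_text_id(texts).shape == (3, 5)
```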
diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py index e69de29..abf198b 100644 --- a/parakeet/datasets/__init__.py +++ b/parakeet/datasets/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py index 7d4dffe..62209e9 100644 --- a/parakeet/datasets/ljspeech.py +++ b/parakeet/datasets/ljspeech.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from pathlib import Path import numpy as np import pandas as pd diff --git a/parakeet/datasets/vctk.py b/parakeet/datasets/vctk.py index b6d2f0c..66e4f70 100644 --- a/parakeet/datasets/vctk.py +++ b/parakeet/datasets/vctk.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pathlib import Path import pandas as pd from ruamel.yaml import YAML @@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset from parakeet.data.datacargo import DataCargo from parakeet.data.batch import TextIDBatcher, WavBatcher + class VCTK(Dataset): def __init__(self, root): - assert isinstance(root, (str, Path)), "root should be a string or Path object" + assert isinstance(root, ( + str, Path)), "root should be a string or Path object" self.root = root if isinstance(root, Path) else Path(root) self.text_root = self.root.joinpath("txt") self.wav_root = self.root.joinpath("wav48") - if not (self.root.joinpath("metadata.csv").exists() and + if not (self.root.joinpath("metadata.csv").exists() and self.root.joinpath("speaker_indices.yaml").exists()): self._prepare_metadata() self.speaker_indices, self.metadata = self._load_metadata() def _load_metadata(self): - yaml=YAML(typ='safe') + yaml = YAML(typ='safe') speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml")) - metadata = pd.read_csv(self.root.joinpath("metadata.csv"), - sep="|", quoting=3, header=1) + metadata = pd.read_csv( + self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1) return speaker_indices, metadata def _prepare_metadata(self): @@ -41,15 +57,19 @@ class VCTK(Dataset): with io.open(str(text_file)) as f: transcription = f.read().strip() wav_file = text_file.with_suffix(".wav") - metadata.append((wav_file.name, speaker_folder.name, transcription)) - metadata = pd.DataFrame.from_records(metadata, - columns=["wave_file", "speaker", "text"]) - + metadata.append( + (wav_file.name, speaker_folder.name, transcription)) + metadata = pd.DataFrame.from_records( + metadata, columns=["wave_file", "speaker", "text"]) + # save them - yaml=YAML(typ='safe') + yaml = YAML(typ='safe') yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml")) - metadata.to_csv(self.root.joinpath("metadata.csv"), - sep="|", quoting=3, index=False) + metadata.to_csv( + self.root.joinpath("metadata.csv"), + sep="|", + quoting=3, + index=False) def _get_example(self, metadatum): wave_file, speaker, text = metadatum @@ -77,5 +97,3 @@ class VCTK(Dataset): speaker_batch = np.array(speaker_batch) phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch) return wav_batch, speaker_batch, phoneme_batch - - \ No newline at end of file diff --git a/parakeet/g2p/__init__.py b/parakeet/g2p/__init__.py index 2b88bdc..5840f33 100644 --- a/parakeet/g2p/__init__.py +++ b/parakeet/g2p/__init__.py @@ -1,5 +1,4 @@ # coding: utf-8 - """Text processing frontend All frontend module should have the following functions: diff --git a/parakeet/g2p/en/__init__.py b/parakeet/g2p/en/__init__.py index 92faf11..01dd223 100644 --- a/parakeet/g2p/en/__init__.py +++ b/parakeet/g2p/en/__init__.py @@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0): from ..text import text_to_sequence text = text_to_sequence(text, ["english_cleaners"]) return text - - - diff --git a/parakeet/g2p/es/__init__.py b/parakeet/g2p/es/__init__.py index fce4d18..8ac385f 100644 --- a/parakeet/g2p/es/__init__.py +++ b/parakeet/g2p/es/__init__.py @@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0): from ..text import text_to_sequence text = text_to_sequence(text, ["basic_cleaners"]) return text - - - diff --git a/parakeet/g2p/jp/__init__.py b/parakeet/g2p/jp/__init__.py index dcb0845..36c7fd8 100644 --- a/parakeet/g2p/jp/__init__.py +++ b/parakeet/g2p/jp/__init__.py @@ -1,6 +1,5 @@ # coding: utf-8 - import MeCab import jaconv from random import random @@ -30,9 +29,9 @@ def 
_yomi(mecab_result): def _mix_pronunciation(tokens, yomis, p): - return "".join( - yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx] - for idx in range(len(tokens))) + return "".join(yomis[idx] + if yomis[idx] is not None and random() < p else tokens[idx] + for idx in range(len(tokens))) def mix_pronunciation(text, p): @@ -59,8 +58,7 @@ def normalize_delimitor(text): def text_to_sequence(text, p=0.0): - for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", - "(", ")", "(", ")"]: + for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]: text = text.replace(c, "") text = text.replace("!", "!") text = text.replace("?", "?") diff --git a/parakeet/g2p/ko/__init__.py b/parakeet/g2p/ko/__init__.py index 2a6465b..ccb8b5f 100644 --- a/parakeet/g2p/ko/__init__.py +++ b/parakeet/g2p/ko/__init__.py @@ -1,6 +1,5 @@ # coding: utf-8 - from random import random n_vocab = 0xffff @@ -13,5 +12,6 @@ _tagger = None def text_to_sequence(text, p=0.0): return [ord(c) for c in text] + [_eos] # EOS + def sequence_to_text(seq): return "".join(chr(n) for n in seq) diff --git a/parakeet/g2p/text/__init__.py b/parakeet/g2p/text/__init__.py index 3942998..312b720 100644 --- a/parakeet/g2p/text/__init__.py +++ b/parakeet/g2p/text/__init__.py @@ -1,8 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re from . import cleaners from .symbols import symbols - # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)} @@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names): if not m: sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) break - sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) + sequence += _symbols_to_sequence( + _clean_text(m.group(1), cleaner_names)) sequence += _arpabet_to_sequence(m.group(2)) text = m.group(3) diff --git a/parakeet/g2p/text/cleaners.py b/parakeet/g2p/text/cleaners.py index 779a977..58553c1 100644 --- a/parakeet/g2p/text/cleaners.py +++ b/parakeet/g2p/text/cleaners.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ''' Cleaners are transformations that run over the input text at both training and eval time. 
@@ -14,31 +27,31 @@ import re from unidecode import unidecode from .numbers import normalize_numbers - # Regular expression matching whitespace: _whitespace_re = re.compile(r'\s+') # List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) + for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), + ]] def expand_abbreviations(text): diff --git a/parakeet/g2p/text/cmudict.py b/parakeet/g2p/text/cmudict.py index 1f1ea9b..bbe7903 100644 --- a/parakeet/g2p/text/cmudict.py +++ b/parakeet/g2p/text/cmudict.py @@ -1,14 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import re - valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', - 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', - 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', - 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', - 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', - 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', - 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' + 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', + 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', + 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', + 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', + 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', + 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', + 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', + 'Y', 'Z', 'ZH' ] _valid_symbol_set = set(valid_symbols) @@ -24,7 +38,10 @@ class CMUDict: else: entries = _parse_cmudict(file_or_path) if not keep_ambiguous: - entries = {word: pron for word, pron in entries.items() if len(pron) == 1} + entries = { + word: pron + for word, pron in entries.items() if len(pron) == 1 + } self._entries = entries def __len__(self): diff --git a/parakeet/g2p/text/numbers.py b/parakeet/g2p/text/numbers.py index 93f676d..24b5817 100644 --- a/parakeet/g2p/text/numbers.py +++ b/parakeet/g2p/text/numbers.py @@ -3,7 +3,6 @@ import inflect import re - _inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') @@ -56,7 +55,8 @@ def _expand_number(m): elif num % 100 == 0: return _inflect.number_to_words(num // 100) + ' hundred' else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + return _inflect.number_to_words( + num, andword='', zero='oh', group=2).replace(', ', ' ') else: return _inflect.number_to_words(num, andword='') diff --git a/parakeet/g2p/text/symbols.py b/parakeet/g2p/text/symbols.py index da87c93..299ca58 100644 --- a/parakeet/g2p/text/symbols.py +++ b/parakeet/g2p/text/symbols.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ''' Defines the set of symbols used in text input to the model. diff --git a/parakeet/models/__init__.py b/parakeet/models/__init__.py index e69de29..abf198b 100644 --- a/parakeet/models/__init__.py +++ b/parakeet/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/models/deepvoice3/__init__.py b/parakeet/models/deepvoice3/__init__.py index 0430987..86f91e0 100644 --- a/parakeet/models/deepvoice3/__init__.py +++ b/parakeet/models/deepvoice3/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec from parakeet.models.deepvoice3.decoder import Decoder, WindowRange from parakeet.models.deepvoice3.converter import Converter diff --git a/parakeet/models/deepvoice3/attention.py b/parakeet/models/deepvoice3/attention.py index 8f2c2c5..33ffc11 100644 --- a/parakeet/models/deepvoice3/attention.py +++ b/parakeet/models/deepvoice3/attention.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np from collections import namedtuple from paddle import fluid @@ -19,23 +33,19 @@ class Attention(dg.Layer): value_projection=True): super(Attention, self).__init__() std = np.sqrt(1 / query_dim) - self.query_proj = Linear(query_dim, - embed_dim, - param_attr=I.Normal(scale=std)) + self.query_proj = Linear( + query_dim, embed_dim, param_attr=I.Normal(scale=std)) if key_projection: std = np.sqrt(1 / embed_dim) - self.key_proj = Linear(embed_dim, - embed_dim, - param_attr=I.Normal(scale=std)) + self.key_proj = Linear( + embed_dim, embed_dim, param_attr=I.Normal(scale=std)) if value_projection: std = np.sqrt(1 / embed_dim) - self.value_proj = Linear(embed_dim, - embed_dim, - param_attr=I.Normal(scale=std)) + self.value_proj = Linear( + embed_dim, embed_dim, param_attr=I.Normal(scale=std)) std = np.sqrt(1 / embed_dim) - self.out_proj = Linear(embed_dim, - query_dim, - param_attr=I.Normal(scale=std)) + self.out_proj = Linear( + embed_dim, query_dim, param_attr=I.Normal(scale=std)) self.key_projection = key_projection self.value_projection = value_projection @@ -102,9 +112,8 @@ class Attention(dg.Layer): x = F.softmax(x) attn_scores = x - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") x = F.matmul(x, values) encoder_length = keys.shape[1] # CAUTION: is it wrong? let it be now diff --git a/parakeet/models/deepvoice3/conv1dglu.py b/parakeet/models/deepvoice3/conv1dglu.py index 23f0109..584c3d7 100644 --- a/parakeet/models/deepvoice3/conv1dglu.py +++ b/parakeet/models/deepvoice3/conv1dglu.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from paddle import fluid @@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer): has residual connection from the input x, and scale the output by np.sqrt(0.5). """ + def __init__(self, n_speakers, speaker_dim, @@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer): ), "this block uses residual connection"\ "the input_channes should equals num_filters" std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)) - self.conv = Conv1DCell(in_channels, - 2 * num_filters, - filter_size, - dilation, - causal, - param_attr=I.Normal(scale=std)) + self.conv = Conv1DCell( + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal, + param_attr=I.Normal(scale=std)) if n_speakers > 1: assert (speaker_dim is not None ), "speaker embed should not be null in multi-speaker case" std = np.sqrt(1 / speaker_dim) - self.fc = Linear(speaker_dim, - num_filters, - param_attr=I.Normal(scale=std)) + self.fc = Linear( + speaker_dim, num_filters, param_attr=I.Normal(scale=std)) def forward(self, x, speaker_embed=None): """ @@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer): C_out means the output channels of Conv1DGLU. 
""" residual = x - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") x = self.conv(x) content, gate = F.split(x, num_or_sections=2, dim=1) @@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer): C_out means the output channels of Conv1DGLU. """ residual = x_t - x_t = F.dropout(x_t, - self.dropout, - dropout_implementation="upscale_in_train") + x_t = F.dropout( + x_t, self.dropout, dropout_implementation="upscale_in_train") x_t = self.conv.add_input(x_t) content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1) diff --git a/parakeet/models/deepvoice3/converter.py b/parakeet/models/deepvoice3/converter.py index 7f94805..5181a5c 100644 --- a/parakeet/models/deepvoice3/converter.py +++ b/parakeet/models/deepvoice3/converter.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from itertools import chain @@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): 2, stride=2, param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout), - Conv1DTranspose( + Conv1DGLU( + n_speakers, + speaker_dim, target_channels, target_channels, - 2, - stride=2, - param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + 3, + dilation=1, + std_mul=1., + dropout=dropout), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout), Conv1DTranspose( + target_channels, + target_channels, + 2, + stride=2, + param_attr=I.Normal(scale=np.sqrt( + 4. / (2 * target_channels)))), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=1, + std_mul=1., + dropout=dropout), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout) ] return upsampling_convolutions @@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): 2, stride=2, param_attr=I.Normal(scale=np.sqrt(1. 
/ (2 * target_channels)))), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=1, + std_mul=1., + dropout=dropout), Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout) ] return upsampling_convolutions def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout): upsampling_convolutions = [ - Conv1DGLU(n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout) ] return upsampling_convolutions @@ -108,6 +125,7 @@ class Converter(dg.Layer): Vocoder that transforms mel spectrogram (or ecoder hidden states) to waveform. """ + def __init__(self, n_speakers, speaker_dim, @@ -161,33 +179,36 @@ class Converter(dg.Layer): std = np.sqrt(std_mul / in_channels) # CAUTION: relu self.convolutions.append( - Conv1D(in_channels, - out_channels, - 1, - act="relu", - param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, + out_channels, + 1, + act="relu", + param_attr=I.Normal(scale=std))) in_channels = out_channels std_mul = 2.0 self.convolutions.append( - Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation=dilation, - std_mul=std_mul, - dropout=dropout)) + Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation=dilation, + std_mul=std_mul, + dropout=dropout)) in_channels = out_channels std_mul = 4.0 # final conv proj, channel transformed to linear dim std = np.sqrt(std_mul * (1 - dropout) / in_channels) # CAUTION: sigmoid - self.last_conv_proj = Conv1D(in_channels, - linear_dim, - 1, - act="sigmoid", - param_attr=I.Normal(scale=std)) + self.last_conv_proj = Conv1D( + in_channels, + linear_dim, + 1, + act="sigmoid", + param_attr=I.Normal(scale=std)) def forward(self, x, speaker_embed=None): """ @@ -229,4 +250,4 @@ class Converter(dg.Layer): out = self.last_conv_proj(x) out = F.transpose(out, [0, 2, 1]) - return out \ No newline at end of file + return out diff --git a/parakeet/models/deepvoice3/decoder.py b/parakeet/models/deepvoice3/decoder.py index 8e6a46b..7b7f581 100644 --- a/parakeet/models/deepvoice3/decoder.py +++ b/parakeet/models/deepvoice3/decoder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np import paddle.fluid.layers as F import paddle.fluid.initializer as I @@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r): class Decoder(dg.Layer): def __init__( - self, - n_speakers, - speaker_dim, - embed_dim, - mel_dim, - r=1, - max_positions=512, - padding_idx=None, # remove it! - preattention=(ConvSpec(128, 5, 1), ) * 4, - convolutions=(ConvSpec(128, 5, 1), ) * 4, - attention=True, - dropout=0.0, - use_memory_mask=False, - force_monotonic_attention=False, - query_position_rate=1.0, - key_position_rate=1.0, - window_range=WindowRange(-1, 3), - key_projection=True, - value_projection=True): + self, + n_speakers, + speaker_dim, + embed_dim, + mel_dim, + r=1, + max_positions=512, + padding_idx=None, # remove it! + preattention=(ConvSpec(128, 5, 1), ) * 4, + convolutions=(ConvSpec(128, 5, 1), ) * 4, + attention=True, + dropout=0.0, + use_memory_mask=False, + force_monotonic_attention=False, + query_position_rate=1.0, + key_position_rate=1.0, + window_range=WindowRange(-1, 3), + key_projection=True, + value_projection=True): super(Decoder, self).__init__() self.dropout = dropout @@ -111,23 +125,17 @@ class Decoder(dg.Layer): conv_channels = convolutions[0].out_channels # only when padding idx is 0 can we easilt handle it - self.embed_keys_positions = PositionEmbedding(max_positions, - embed_dim, - padding_idx=0) - self.embed_query_positions = PositionEmbedding(max_positions, - conv_channels, - padding_idx=0) + self.embed_keys_positions = PositionEmbedding( + max_positions, embed_dim, padding_idx=0) + self.embed_query_positions = PositionEmbedding( + max_positions, conv_channels, padding_idx=0) if n_speakers > 1: std = np.sqrt((1 - dropout) / speaker_dim) - self.speaker_proj1 = Linear(speaker_dim, - 1, - act="sigmoid", - param_attr=I.Normal(scale=std)) - self.speaker_proj2 = Linear(speaker_dim, - 1, - act="sigmoid", - param_attr=I.Normal(scale=std)) + self.speaker_proj1 = Linear( + speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std)) + self.speaker_proj2 = Linear( + speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std)) # prenet self.prenet = dg.LayerList() @@ -138,24 +146,26 @@ class Decoder(dg.Layer): # conv1d & relu std = np.sqrt(std_mul / in_channels) self.prenet.append( - Conv1D(in_channels, - out_channels, - 1, - act="relu", - param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, + out_channels, + 1, + act="relu", + param_attr=I.Normal(scale=std))) in_channels = out_channels std_mul = 2.0 self.prenet.append( - Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation, - std_mul, - dropout, - causal=True, - residual=True)) + Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation, + std_mul, + dropout, + causal=True, + residual=True)) in_channels = out_channels std_mul = 4.0 @@ -184,16 +194,17 @@ class Decoder(dg.Layer): assert ( in_channels == out_channels ), "the stack of convolution & attention does not change channels" - conv_layer = Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation, - std_mul, - dropout, - causal=True, - residual=False) + conv_layer = Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation, + std_mul, + dropout, + causal=True, + residual=False) attn_layer = Attention( out_channels, embed_dim, @@ -211,10 +222,8 @@ class Decoder(dg.Layer): # 1 * 1 conv to transform channels std = np.sqrt(std_mul * (1 - dropout) / in_channels) - self.last_conv = 
Conv1D(in_channels, - mel_dim * r, - 1, - param_attr=I.Normal(scale=std)) + self.last_conv = Conv1D( + in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std)) # mel (before sigmoid) to done hat std = np.sqrt(1 / in_channels) @@ -308,9 +317,8 @@ class Decoder(dg.Layer): # (B, C, T) frames = F.transpose(frames, [0, 2, 1]) x = frames - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: if isinstance(layer, Conv1DGLU): @@ -408,14 +416,13 @@ class Decoder(dg.Layer): test_inputs = fold_adjacent_frames(test_inputs, self.r) test_inputs = F.transpose(test_inputs, [0, 2, 1]) - initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1), - dtype=keys.dtype) + initial_input = F.zeros( + (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype) t = 0 # decoder time step while True: - frame_pos = F.fill_constant((batch_size, 1), - value=t + 1, - dtype="int64") + frame_pos = F.fill_constant( + (batch_size, 1), value=t + 1, dtype="int64") w = self.query_position_rate if self.n_speakers > 1: w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1]) @@ -433,9 +440,8 @@ class Decoder(dg.Layer): current_input = initial_input x_t = current_input - x_t = F.dropout(x_t, - self.dropout, - dropout_implementation="upscale_in_train") + x_t = F.dropout( + x_t, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: @@ -453,15 +459,15 @@ class Decoder(dg.Layer): x_t = F.transpose(x_t, [0, 2, 1]) if frame_pos_embed is not None: x_t += frame_pos_embed - x_t, attn_scores = attn( - x_t, (keys, values), mask, - last_attended[i] if test_inputs is None else None) + x_t, attn_scores = attn(x_t, (keys, values), mask, + last_attended[i] + if test_inputs is None else None) x_t = F.transpose(x_t, [0, 2, 1]) step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc) # update last attended when necessary if self.force_monotonic_attention[i]: - last_attended[i] = np.argmax(attn_scores.numpy(), - axis=-1)[0][0] + last_attended[i] = np.argmax( + attn_scores.numpy(), axis=-1)[0][0] x_t = F.scale(residual + x_t, np.sqrt(0.5)) if len(step_attn_scores): # (B, 1, T_enc) again @@ -485,8 +491,8 @@ class Decoder(dg.Layer): t += 1 if test_inputs is None: - if F.reduce_min(done_t).numpy( - )[0] > 0.5 and t > self.min_decoder_steps: + if F.reduce_min(done_t).numpy()[ + 0] > 0.5 and t > self.min_decoder_steps: break elif t > self.max_decoder_steps: break diff --git a/parakeet/models/deepvoice3/encoder.py b/parakeet/models/deepvoice3/encoder.py index ebcd62f..b3e8bfb 100644 --- a/parakeet/models/deepvoice3/encoder.py +++ b/parakeet/models/deepvoice3/encoder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
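The `Decoder.decode` hunks above implement free-running inference: each step emits a done probability, `force_monotonic_attention` remembers the argmax of the attention row, and decoding stops once every sequence in the batch reports done (or a step cap is hit). A sketch of just that control flow, with a random toy step standing in for the real decoder (all names and sizes here are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)

def toy_decode_step(t):
    # stand-ins for one real decoder step: a random attention row over
    # T_enc positions and a "done" probability that grows with t
    attn_scores = rng.random((1, 1, 12))      # (B, T_dec=1, T_enc)
    done_t = np.array([min(1.0, t / 20.0)])   # (B,)
    return attn_scores, done_t

min_decoder_steps, max_decoder_steps = 10, 100
last_attended = 0   # kept per attention layer in the real code
t = 0
while True:
    attn_scores, done_t = toy_decode_step(t)
    # force_monotonic_attention: remember where we attended so the next
    # step can restrict attention to a window around this position
    last_attended = int(np.argmax(attn_scores, axis=-1)[0][0])
    t += 1
    # same stop rule as above: the whole batch reports done (min > 0.5)
    # and a minimum number of steps has passed, or the cap is reached
    if float(done_t.min()) > 0.5 and t > min_decoder_steps:
        break
    if t > max_decoder_steps:
        break
print("stopped at step", t, "last attended position", last_attended)
```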
+ import numpy as np from collections import namedtuple @@ -33,14 +47,16 @@ class Encoder(dg.Layer): self.dropout = dropout if n_speakers > 1: std = np.sqrt((1 - dropout) / speaker_dim) - self.sp_proj1 = Linear(speaker_dim, - embed_dim, - act="softsign", - param_attr=I.Normal(scale=std)) - self.sp_proj2 = Linear(speaker_dim, - embed_dim, - act="softsign", - param_attr=I.Normal(scale=std)) + self.sp_proj1 = Linear( + speaker_dim, + embed_dim, + act="softsign", + param_attr=I.Normal(scale=std)) + self.sp_proj2 = Linear( + speaker_dim, + embed_dim, + act="softsign", + param_attr=I.Normal(scale=std)) self.n_speakers = n_speakers self.convolutions = dg.LayerList() @@ -51,31 +67,34 @@ class Encoder(dg.Layer): if in_channels != out_channels: std = np.sqrt(std_mul / in_channels) self.convolutions.append( - Conv1D(in_channels, - out_channels, - 1, - act="relu", - param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, + out_channels, + 1, + act="relu", + param_attr=I.Normal(scale=std))) in_channels = out_channels std_mul = 2.0 self.convolutions.append( - Conv1DGLU(n_speakers, - speaker_dim, - in_channels, - out_channels, - filter_size, - dilation, - std_mul, - dropout, - causal=False, - residual=True)) + Conv1DGLU( + n_speakers, + speaker_dim, + in_channels, + out_channels, + filter_size, + dilation, + std_mul, + dropout, + causal=False, + residual=True)) in_channels = out_channels std_mul = 4.0 std = np.sqrt(std_mul * (1 - dropout) / in_channels) self.convolutions.append( - Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std))) + Conv1D( + in_channels, embed_dim, 1, param_attr=I.Normal(scale=std))) def forward(self, x, speaker_embed=None): """ @@ -96,9 +115,8 @@ class Encoder(dg.Layer): representation for values. """ x = self.embed(x) - x = F.dropout(x, - self.dropout, - dropout_implementation="upscale_in_train") + x = F.dropout( + x, self.dropout, dropout_implementation="upscale_in_train") x = F.transpose(x, [0, 2, 1]) if self.n_speakers > 1 and speaker_embed is not None: diff --git a/parakeet/models/deepvoice3/loss.py b/parakeet/models/deepvoice3/loss.py index 86412e7..be6f0bd 100644 --- a/parakeet/models/deepvoice3/loss.py +++ b/parakeet/models/deepvoice3/loss.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
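Several hunks above scale residual sums by `np.sqrt(0.5)` (for example `F.scale(residual + x_t, np.sqrt(0.5))` in the decoder). A two-line check of the motivation: summing two roughly independent unit-variance signals doubles the variance, and multiplying by sqrt(0.5) restores it:

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal(1_000_000)
b = rng.standard_normal(1_000_000)

print(np.var(a + b))                    # ~2.0: the sum doubles the variance
print(np.var((a + b) * np.sqrt(0.5)))   # ~1.0: sqrt(0.5) restores unit variance
```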
+ import numpy as np from numba import jit @@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g): return W -def guided_attentions(encoder_lengths, - decoder_lengths, - max_decoder_len, +def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len, g=0.2): B = len(encoder_lengths) max_input_len = encoder_lengths.max() @@ -93,9 +105,8 @@ class TTSLoss(object): def binary_divergence(self, prediction, target, mask): flattened_prediction = F.reshape(prediction, [-1, 1]) flattened_target = F.reshape(target, [-1, 1]) - flattened_loss = F.log_loss(flattened_prediction, - flattened_target, - epsilon=1e-8) + flattened_loss = F.log_loss( + flattened_prediction, flattened_target, epsilon=1e-8) bin_div = fluid.layers.reshape(flattened_loss, prediction.shape) w = self.masked_weight @@ -163,23 +174,20 @@ class TTSLoss(object): max_mel_steps = max_frames // self.downsample_factor max_decoder_steps = max_mel_steps // self.r - decoder_mask = F.sequence_mask(n_frames // self.downsample_factor // - self.r, - max_decoder_steps, - dtype="float32") - mel_mask = F.sequence_mask(n_frames // self.downsample_factor, - max_mel_steps, - dtype="float32") + decoder_mask = F.sequence_mask( + n_frames // self.downsample_factor // self.r, + max_decoder_steps, + dtype="float32") + mel_mask = F.sequence_mask( + n_frames // self.downsample_factor, max_mel_steps, dtype="float32") lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32") if compute_lin_loss: lin_hyp = lin_hyp[:, :-self.time_shift, :] lin_ref = lin_ref[:, self.time_shift:, :] lin_mask = lin_mask[:, self.time_shift:, :] - lin_l1_loss = self.l1_loss(lin_hyp, - lin_ref, - lin_mask, - priority_bin=self.priority_bin) + lin_l1_loss = self.l1_loss( + lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin) lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask) lin_loss = self.binary_divergence_weight * lin_bce_loss \ + (1 - self.binary_divergence_weight) * lin_l1_loss @@ -197,9 +205,10 @@ class TTSLoss(object): total_loss += mel_loss if compute_attn_loss: - attn_loss = self.attention_loss( - attn_hyp, input_lengths.numpy(), - n_frames.numpy() // (self.downsample_factor * self.r)) + attn_loss = self.attention_loss(attn_hyp, + input_lengths.numpy(), + n_frames.numpy() // + (self.downsample_factor * self.r)) total_loss += attn_loss if compute_done_loss: diff --git a/parakeet/models/deepvoice3/model.py b/parakeet/models/deepvoice3/model.py index 57c3fcf..f2fb271 100644 --- a/parakeet/models/deepvoice3/model.py +++ b/parakeet/models/deepvoice3/model.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
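`guided_attention` above returns a penalty matrix `W`, but its body falls outside this diff. Assuming the usual diagonal-prior formulation (Tachibana et al.; the exact indexing convention is a guess), a NumPy sketch:

```python
import numpy as np

def guided_attention(N, max_N, T, max_T, g=0.2):
    # soft diagonal prior: positions far from the n/N ~ t/T diagonal are
    # penalized; padding (beyond N, T) stays zero
    W = np.zeros((max_N, max_T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1.0 - np.exp(-((n / N - t / T) ** 2) / (2 * g ** 2))
    return W

W = guided_attention(N=5, max_N=6, T=10, max_T=12)
print(W.shape)           # (6, 12)
print(W[0, 0], W[0, 9])  # ~0 on the diagonal, close to 1 far off it
```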
+ import numpy as np import paddle.fluid.layers as F @@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer): mel_outputs, alignments, done, decoder_states = self.decoder( (keys, values), valid_lengths, mel_inputs, text_positions, frame_positions, speaker_embed) - linear_outputs = self.converter( - decoder_states if self.use_decoder_states else mel_outputs, - speaker_embed) + linear_outputs = self.converter(decoder_states + if self.use_decoder_states else + mel_outputs, speaker_embed) return mel_outputs, linear_outputs, alignments, done def transduce(self, text_sequences, text_positions, speaker_indices=None): @@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer): keys, values = self.encoder(text_sequences, speaker_embed) mel_outputs, alignments, done, decoder_states = self.decoder.decode( (keys, values), text_positions, speaker_embed) - linear_outputs = self.converter( - decoder_states if self.use_decoder_states else mel_outputs, - speaker_embed) + linear_outputs = self.converter(decoder_states + if self.use_decoder_states else + mel_outputs, speaker_embed) return mel_outputs, linear_outputs, alignments, done diff --git a/parakeet/models/deepvoice3/position_embedding.py b/parakeet/models/deepvoice3/position_embedding.py index aefb00c..88ef5cb 100644 --- a/parakeet/models/deepvoice3/position_embedding.py +++ b/parakeet/models/deepvoice3/position_embedding.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np from paddle import fluid import paddle.fluid.layers as F @@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer): speaker_position_rate) # (B, V, C) # make indices for gather_nd batch_id = F.expand( - F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]), - [1, time_steps]) + F.unsqueeze( + F.range( + 0, batch_size, 1, dtype="int64"), [1]), [1, time_steps]) # (B, T, 2) gather_nd_id = F.stack([batch_id, indices], -1) out = F.gather_nd(weight, gather_nd_id) - return out \ No newline at end of file + return out diff --git a/parakeet/models/fastspeech/__init__.py b/parakeet/models/fastspeech/__init__.py index e69de29..131e065 100644 --- a/parakeet/models/fastspeech/__init__.py +++ b/parakeet/models/fastspeech/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
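`PositionEmbedding.forward` above gathers rows of a sinusoid table computed at a per-speaker rate (`speaker_position_rate`). The table construction itself is not in this diff; a hypothetical NumPy version of a rate-scaled sinusoid table, following the standard transformer formula:

```python
import numpy as np

def scaled_sinusoid_table(n_positions, d_model, w=1.0):
    # sinusoidal position encodings with a position rate w; DeepVoice 3
    # uses a per-speaker w to stretch or compress the encodings in time
    pos = np.arange(n_positions)[:, None]   # (T, 1)
    dim = np.arange(d_model)[None, :]       # (1, C)
    angle = w * pos / np.power(1.0e4, 2 * (dim // 2) / d_model)
    return np.where(dim % 2 == 0, np.sin(angle), np.cos(angle))  # (T, C)

print(scaled_sinusoid_table(8, 4, w=1.0))
print(scaled_sinusoid_table(8, 4, w=1.2)[1])  # same position, faster rate
```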
\ No newline at end of file diff --git a/parakeet/models/fastspeech/decoder.py b/parakeet/models/fastspeech/decoder.py index 732fed4..46eb391 100644 --- a/parakeet/models/fastspeech/decoder.py +++ b/parakeet/models/fastspeech/decoder.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.models.transformer_tts.utils import * from parakeet.models.fastspeech.fft_block import FFTBlock + class Decoder(dg.Layer): def __init__(self, len_max_seq, @@ -18,16 +32,29 @@ class Decoder(dg.Layer): super(Decoder, self).__init__() n_position = len_max_seq + 1 - self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_model], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + self.pos_inp = get_sinusoid_encoding_table( + n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding( + size=[n_position, d_model], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.layer_stack = [ + FFTBlock( + d_model, + d_inner, + n_head, + d_k, + d_v, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=dropout) for _ in range(n_layers) + ] for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) - + def forward(self, enc_seq, enc_pos): """ Decoder layer of FastSpeech. @@ -57,4 +84,4 @@ class Decoder(dg.Layer): slf_attn_mask=slf_attn_mask) dec_slf_attn_list += [dec_slf_attn] - return dec_output, dec_slf_attn_list \ No newline at end of file + return dec_output, dec_slf_attn_list diff --git a/parakeet/models/fastspeech/encoder.py b/parakeet/models/fastspeech/encoder.py index ac96e39..15c8d60 100644 --- a/parakeet/models/fastspeech/encoder.py +++ b/parakeet/models/fastspeech/encoder.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
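The FastSpeech encoder and decoder above gate their FFT blocks with masks built by helpers imported from `parakeet.models.transformer_tts.utils`, which this diff never shows. Hypothetical NumPy equivalents, assuming padding id 0 (the triangular mask is the causal one used by the transformer_tts decoder further down):

```python
import numpy as np

def get_non_pad_mask(seq):
    # 1.0 where a position holds a real token, 0.0 where it is padding (id 0)
    return (seq != 0).astype(np.float32)[:, :, None]      # (B, T, 1)

def get_attn_key_pad_mask(seq_k, len_q):
    # 1.0 where attention must be blocked because the key is padding
    mask = (seq_k == 0).astype(np.float32)[:, None, :]    # (B, 1, T_k)
    return np.repeat(mask, len_q, axis=1)                 # (B, T_q, T_k)

def get_triu_mask(T):
    # strictly upper-triangular mask: blocks attention to future frames
    return np.triu(np.ones((T, T), dtype=np.float32), k=1)

enc_pos = np.array([[1, 2, 3, 0, 0]])           # one sequence, 3 real positions
print(get_non_pad_mask(enc_pos)[0, :, 0])       # [1. 1. 1. 0. 0.]
print(get_attn_key_pad_mask(enc_pos, 5).shape)  # (1, 5, 5)
print(get_triu_mask(4))
```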
import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.models.transformer_tts.utils import * from parakeet.models.fastspeech.fft_block import FFTBlock + class Encoder(dg.Layer): def __init__(self, n_src_vocab, @@ -19,14 +33,28 @@ class Encoder(dg.Layer): super(Encoder, self).__init__() n_position = len_max_seq + 1 - self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0) - self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) - self.position_enc = dg.Embedding(size=[n_position, d_model], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] + self.src_word_emb = dg.Embedding( + size=[n_src_vocab, d_model], padding_idx=0) + self.pos_inp = get_sinusoid_encoding_table( + n_position, d_model, padding_idx=0) + self.position_enc = dg.Embedding( + size=[n_position, d_model], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.layer_stack = [ + FFTBlock( + d_model, + d_inner, + n_head, + d_k, + d_v, + fft_conv1d_kernel, + fft_conv1d_padding, + dropout=dropout) for _ in range(n_layers) + ] for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) @@ -52,7 +80,8 @@ class Encoder(dg.Layer): non_pad_mask = get_non_pad_mask(character) # -- Forward - enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) + enc_output = self.src_word_emb(character) + self.position_enc( + text_pos) #(N, T, C) for enc_layer in self.layer_stack: enc_output, enc_slf_attn = enc_layer( @@ -60,5 +89,5 @@ class Encoder(dg.Layer): non_pad_mask=non_pad_mask, slf_attn_mask=slf_attn_mask) enc_slf_attn_list += [enc_slf_attn] - - return enc_output, non_pad_mask, enc_slf_attn_list \ No newline at end of file + + return enc_output, non_pad_mask, enc_slf_attn_list diff --git a/parakeet/models/fastspeech/fastspeech.py b/parakeet/models/fastspeech/fastspeech.py index 4a01b95..91478af 100644 --- a/parakeet/models/fastspeech/fastspeech.py +++ b/parakeet/models/fastspeech/fastspeech.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid @@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator from parakeet.models.fastspeech.encoder import Encoder from parakeet.models.fastspeech.decoder import Decoder + class FastSpeech(dg.Layer): def __init__(self, cfg): " FastSpeech" super(FastSpeech, self).__init__() - self.encoder = Encoder(n_src_vocab=len(symbols)+1, - len_max_seq=cfg['max_seq_len'], - n_layers=cfg['encoder_n_layer'], - n_head=cfg['encoder_head'], - d_k=cfg['fs_hidden_size'] // cfg['encoder_head'], - d_v=cfg['fs_hidden_size'] // cfg['encoder_head'], - d_model=cfg['fs_hidden_size'], - d_inner=cfg['encoder_conv1d_filter_size'], - fft_conv1d_kernel=cfg['fft_conv1d_filter'], - fft_conv1d_padding=cfg['fft_conv1d_padding'], - dropout=0.1) - self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'], - out_channels=cfg['duration_predictor_output_size'], - filter_size=cfg['duration_predictor_filter_size'], - dropout=cfg['dropout']) - self.decoder = Decoder(len_max_seq=cfg['max_seq_len'], - n_layers=cfg['decoder_n_layer'], - n_head=cfg['decoder_head'], - d_k=cfg['fs_hidden_size'] // cfg['decoder_head'], - d_v=cfg['fs_hidden_size'] // cfg['decoder_head'], - d_model=cfg['fs_hidden_size'], - d_inner=cfg['decoder_conv1d_filter_size'], - fft_conv1d_kernel=cfg['fft_conv1d_filter'], - fft_conv1d_padding=cfg['fft_conv1d_padding'], - dropout=0.1) - self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) + self.encoder = Encoder( + n_src_vocab=len(symbols) + 1, + len_max_seq=cfg['max_seq_len'], + n_layers=cfg['encoder_n_layer'], + n_head=cfg['encoder_head'], + d_k=cfg['fs_hidden_size'] // cfg['encoder_head'], + d_v=cfg['fs_hidden_size'] // cfg['encoder_head'], + d_model=cfg['fs_hidden_size'], + d_inner=cfg['encoder_conv1d_filter_size'], + fft_conv1d_kernel=cfg['fft_conv1d_filter'], + fft_conv1d_padding=cfg['fft_conv1d_padding'], + dropout=0.1) + self.length_regulator = LengthRegulator( + input_size=cfg['fs_hidden_size'], + out_channels=cfg['duration_predictor_output_size'], + filter_size=cfg['duration_predictor_filter_size'], + dropout=cfg['dropout']) + self.decoder = Decoder( + len_max_seq=cfg['max_seq_len'], + n_layers=cfg['decoder_n_layer'], + n_head=cfg['decoder_head'], + d_k=cfg['fs_hidden_size'] // cfg['decoder_head'], + d_v=cfg['fs_hidden_size'] // cfg['decoder_head'], + d_model=cfg['fs_hidden_size'], + d_inner=cfg['decoder_conv1d_filter_size'], + fft_conv1d_kernel=cfg['fft_conv1d_filter'], + fft_conv1d_padding=cfg['fft_conv1d_padding'], + dropout=0.1) + self.weight = fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()) k = math.sqrt(1 / cfg['fs_hidden_size']) - self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) - self.mel_linear = dg.Linear(cfg['fs_hidden_size'], - cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'], - param_attr = self.weight, - bias_attr = self.bias,) - self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'], - num_hidden=512, - filter_size=5, - padding=int(5 / 2), - num_conv=5, - outputs_per_step=cfg['audio']['outputs_per_step'], - use_cudnn=True, - dropout=0.1, - batchnorm_last=True) + self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k)) + self.mel_linear = dg.Linear( + cfg['fs_hidden_size'], + cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'], + param_attr=self.weight, + bias_attr=self.bias, ) + self.postnet = PostConvNet( + n_mels=cfg['audio']['num_mels'], 
+ num_hidden=512, + filter_size=5, + padding=int(5 / 2), + num_conv=5, + outputs_per_step=cfg['audio']['outputs_per_step'], + use_cudnn=True, + dropout=0.1, + batchnorm_last=True) - def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): + def forward(self, + character, + text_pos, + mel_pos=None, + length_target=None, + alpha=1.0): """ FastSpeech model. @@ -80,22 +106,25 @@ class FastSpeech(dg.Layer): dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. """ - encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) + encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder( + character, text_pos) if fluid.framework._dygraph_tracer()._train_mode: - - length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, - target=length_target, - alpha=alpha) - decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) + + length_regulator_output, duration_predictor_output = self.length_regulator( + encoder_output, target=length_target, alpha=alpha) + decoder_output, dec_slf_attn_list = self.decoder( + length_regulator_output, mel_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list else: - length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) - decoder_output, _ = self.decoder(length_regulator_output, decoder_pos) + length_regulator_output, decoder_pos = self.length_regulator( + encoder_output, alpha=alpha) + decoder_output, _ = self.decoder(length_regulator_output, + decoder_pos) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output - return mel_output, mel_output_postnet \ No newline at end of file + return mel_output, mel_output_postnet diff --git a/parakeet/models/fastspeech/fft_block.py b/parakeet/models/fastspeech/fft_block.py index ea86328..f50f11a 100644 --- a/parakeet/models/fastspeech/fft_block.py +++ b/parakeet/models/fastspeech/fft_block.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
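`FastSpeech.forward` above branches on train mode: during training the length regulator expands encoder states by the ground-truth durations (`length_target`), while at inference it rounds the duration predictor's output. A toy NumPy sketch of that branch (the `np.repeat` expansion is a stand-in for `LengthRegulator.expand`):

```python
import numpy as np

def length_regulate(hidden, durations):
    # stand-in for LengthRegulator.expand: repeat each phoneme's hidden
    # state along the time axis by its integer duration
    return np.repeat(hidden, durations, axis=0)

hidden = np.arange(8.0).reshape(4, 2)        # 4 phonemes, 2 channels
length_target = np.array([2, 1, 0, 3])       # ground truth, training branch
predicted = np.array([1.7, 0.9, 0.2, 3.4])   # duration predictor, inference branch

print(length_regulate(hidden, length_target).shape)              # (6, 2)
print(length_regulate(hidden, np.round(predicted).astype(int)))  # rounded, as above
```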
import numpy as np import math import paddle.fluid.dygraph as dg @@ -6,11 +19,32 @@ import paddle.fluid as fluid from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward + class FFTBlock(dg.Layer): - def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): + def __init__(self, + d_model, + d_inner, + n_head, + d_k, + d_v, + filter_size, + padding, + dropout=0.2): super(FFTBlock, self).__init__() - self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False) - self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout) + self.slf_attn = MultiheadAttention( + d_model, + d_k, + d_v, + num_head=n_head, + is_bias=True, + dropout=dropout, + is_concat=False) + self.pos_ffn = PositionwiseFeedForward( + d_model, + d_inner, + filter_size=filter_size, + padding=padding, + dropout=dropout) def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): """ @@ -27,10 +61,11 @@ class FFTBlock(dg.Layer): output (Variable), Shape(B, T, C), the output after self-attention & ffn. slf_attn (Variable), Shape(B * n_head, T, T), the self attention. """ - output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) + output, slf_attn = self.slf_attn( + enc_input, enc_input, enc_input, mask=slf_attn_mask) output *= non_pad_mask output = self.pos_ffn(output) output *= non_pad_mask - return output, slf_attn \ No newline at end of file + return output, slf_attn diff --git a/parakeet/models/fastspeech/length_regulator.py b/parakeet/models/fastspeech/length_regulator.py index d90eaa5..331597a 100644 --- a/parakeet/models/fastspeech/length_regulator.py +++ b/parakeet/models/fastspeech/length_regulator.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
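`LengthRegulator.pad` above brings the per-utterance expansions back to a common length so they can be stacked into one batch tensor. The same logic in NumPy (toy shapes):

```python
import numpy as np

def pad_and_stack(outputs):
    # expanded utterances in a batch have different lengths, so pad each
    # along time to the longest before stacking (cf. layers.pad + stack)
    max_len = max(o.shape[0] for o in outputs)
    padded = [np.pad(o, ((0, max_len - o.shape[0]), (0, 0))) for o in outputs]
    return np.stack(padded)          # (B, max_len, C)

a = np.ones((3, 2))
b = np.ones((5, 2))
print(pad_and_stack([a, b]).shape)   # (2, 5, 2)
```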
import numpy as np import math import parakeet.models.fastspeech.utils @@ -6,47 +19,50 @@ import paddle.fluid.layers as layers import paddle.fluid as fluid from parakeet.modules.customized import Conv1D + class LengthRegulator(dg.Layer): def __init__(self, input_size, out_channels, filter_size, dropout=0.1): super(LengthRegulator, self).__init__() - self.duration_predictor = DurationPredictor(input_size=input_size, - out_channels=out_channels, - filter_size=filter_size, - dropout=dropout) + self.duration_predictor = DurationPredictor( + input_size=input_size, + out_channels=out_channels, + filter_size=filter_size, + dropout=dropout) def LR(self, x, duration_predictor_output, alpha=1.0): output = [] batch_size = x.shape[0] for i in range(batch_size): - output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha)) + output.append( + self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], + alpha)) output = self.pad(output) return output - + def pad(self, input_ele): max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))]) out_list = [] for i in range(len(input_ele)): pad_len = max_len - input_ele[i].shape[0] - one_batch_padded = layers.pad( - input_ele[i], [0, pad_len, 0, 0], pad_value=0.0) + one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0], + pad_value=0.0) out_list.append(one_batch_padded) out_padded = layers.stack(out_list) return out_padded - + def expand(self, batch, predicted, alpha): out = [] time_steps = batch.shape[1] fertilities = predicted.numpy() - batch = layers.squeeze(batch,[0]) - - + batch = layers.squeeze(batch, [0]) + for i in range(time_steps): - if fertilities[0,i]==0: + if fertilities[0, i] == 0: continue - out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1])) + out.append( + layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1])) out = layers.concat(out, axis=0) return out - def forward(self, x, alpha=1.0, target=None): """ @@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer): else: duration_predictor_output = layers.round(duration_predictor_output) output = self.LR(x, duration_predictor_output, alpha) - mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1)) + mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1)) mel_pos = layers.unsqueeze(mel_pos, [0]) return output, mel_pos + class DurationPredictor(dg.Layer): def __init__(self, input_size, out_channels, filter_size, dropout=0.1): super(DurationPredictor, self).__init__() @@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer): self.dropout = dropout k = math.sqrt(1 / self.input_size) - self.conv1 = Conv1D(num_channels = self.input_size, - num_filters = self.out_channels, - filter_size = self.filter_size, - padding=1, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) - #data_format='NTC') + self.conv1 = Conv1D( + num_channels=self.input_size, + num_filters=self.out_channels, + filter_size=self.filter_size, + padding=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + #data_format='NTC') k = math.sqrt(1 / self.out_channels) - self.conv2 = Conv1D(num_channels = self.out_channels, - num_filters = self.out_channels, - filter_size = self.filter_size, - padding=1, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = 
fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) - #data_format='NTC') + self.conv2 = Conv1D( + num_channels=self.out_channels, + num_filters=self.out_channels, + filter_size=self.filter_size, + padding=1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + #data_format='NTC') self.layer_norm1 = dg.LayerNorm(self.out_channels) self.layer_norm2 = dg.LayerNorm(self.out_channels) - self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) + self.weight = fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()) k = math.sqrt(1 / self.out_channels) - self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) + self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k)) - self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight, - bias_attr = self.bias) + self.linear = dg.Linear( + self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias) def forward(self, encoder_output): """ @@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer): out (Variable), Shape(B, T, C), the output of duration predictor. """ # encoder_output.shape(N, T, C) - out = layers.transpose(encoder_output, [0,2,1]) + out = layers.transpose(encoder_output, [0, 2, 1]) out = self.conv1(out) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) out = self.conv2(out) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) out = layers.relu(self.linear(out)) out = layers.squeeze(out, axes=[-1]) - - - return out - + return out diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py index a94de8d..5e680f0 100644 --- a/parakeet/models/fastspeech/utils.py +++ b/parakeet/models/fastspeech/utils.py @@ -1,5 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import numpy as np + def get_alignment(attn_probs, mel_lens, n_head): max_F = 0 assert attn_probs[0].shape[0] % n_head == 0 @@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head): for i in range(len(attn_probs)): multi_attn = attn_probs[i].numpy() for j in range(n_head): - attn = multi_attn[j*batch_size:(j+1)*batch_size] + attn = multi_attn[j * batch_size:(j + 1) * batch_size] F = score_F(attn) if max_F < F: max_F = F max_attn = attn alignment = compute_duration(max_attn, mel_lens) return alignment - + + def score_F(attn): max = np.max(attn, axis=-1) mean = np.mean(max) return mean + def compute_duration(attn, mel_lens): - alignment = np.zeros([attn.shape[0],attn.shape[2]]) + alignment = np.zeros([attn.shape[0], attn.shape[2]]) mel_lens = mel_lens.numpy() for i in range(attn.shape[0]): for j in range(mel_lens[i]): - max_index = np.argmax(attn[i,j]) - alignment[i,max_index] += 1 + max_index = np.argmax(attn[i, j]) + alignment[i, max_index] += 1 return alignment - - diff --git a/parakeet/models/transformer_tts/__init__.py b/parakeet/models/transformer_tts/__init__.py index e69de29..131e065 100644 --- a/parakeet/models/transformer_tts/__init__.py +++ b/parakeet/models/transformer_tts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/parakeet/models/transformer_tts/cbhg.py b/parakeet/models/transformer_tts/cbhg.py index 94b907f..ca93536 100644 --- a/parakeet/models/transformer_tts/cbhg.py +++ b/parakeet/models/transformer_tts/cbhg.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
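`compute_duration` above turns a teacher attention matrix into per-phoneme durations by letting each decoder frame vote (argmax) for one encoder position. A self-contained toy run of that idea:

```python
import numpy as np

def durations_from_attention(attn, mel_len):
    # each decoder frame votes for one encoder position; a phoneme's
    # duration is its number of votes (cf. compute_duration above)
    counts = np.zeros(attn.shape[1], dtype=np.int64)
    for j in range(mel_len):
        counts[np.argmax(attn[j])] += 1
    return counts

# toy (mel_T=6, text_T=3) attention: frames 0-1 -> phoneme 0, 2 -> 1, 3-5 -> 2
attn = np.array([
    [0.9, 0.05, 0.05],
    [0.8, 0.1, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.1, 0.8],
    [0.0, 0.2, 0.8],
    [0.1, 0.2, 0.7],
])
print(durations_from_attention(attn, mel_len=6))  # [2 1 3]
```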
import math from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg @@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D from parakeet.modules.dynamic_gru import DynamicGRU import numpy as np + class CBHG(dg.Layer): - def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, - max_pool_kernel_size=2, is_post=False): + def __init__(self, + hidden_size, + batch_size, + K=16, + projection_size=256, + num_gru_layers=2, + max_pool_kernel_size=2, + is_post=False): super(CBHG, self).__init__() """ :param hidden_size: dimension of hidden unit @@ -24,28 +44,39 @@ class CBHG(dg.Layer): self.projection_size = projection_size self.conv_list = [] k = math.sqrt(1 / projection_size) - self.conv_list.append(Conv1D(num_channels = projection_size, - num_filters = hidden_size, - filter_size = 1, - padding = int(np.floor(1/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) + self.conv_list.append( + Conv1D( + num_channels=projection_size, + num_filters=hidden_size, + filter_size=1, + padding=int(np.floor(1 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) k = math.sqrt(1 / hidden_size) - for i in range(2,K+1): - self.conv_list.append(Conv1D(num_channels = hidden_size, - num_filters = hidden_size, - filter_size = i, - padding = int(np.floor(i/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) + for i in range(2, K + 1): + self.conv_list.append( + Conv1D( + num_channels=hidden_size, + num_filters=hidden_size, + filter_size=i, + padding=int(np.floor(i / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) self.batchnorm_list = [] for i in range(K): - self.batchnorm_list.append(dg.BatchNorm(hidden_size, - data_layout='NCHW')) + self.batchnorm_list.append( + dg.BatchNorm( + hidden_size, data_layout='NCHW')) for i, layer in enumerate(self.batchnorm_list): self.add_sublayer("batchnorm_list_{}".format(i), layer) @@ -53,91 +84,120 @@ class CBHG(dg.Layer): conv_outdim = hidden_size * K k = math.sqrt(1 / conv_outdim) - self.conv_projection_1 = Conv1D(num_channels = conv_outdim, - num_filters = hidden_size, - filter_size = 3, - padding = int(np.floor(3/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) + self.conv_projection_1 = Conv1D( + num_channels=conv_outdim, + num_filters=hidden_size, + filter_size=3, + padding=int(np.floor(3 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) k = math.sqrt(1 / hidden_size) - self.conv_projection_2 = Conv1D(num_channels = hidden_size, - num_filters = projection_size, - filter_size = 3, - padding = int(np.floor(3/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, 
high=k))) + self.conv_projection_2 = Conv1D( + num_channels=hidden_size, + num_filters=projection_size, + filter_size=3, + padding=int(np.floor(3 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) - self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, - data_layout='NCHW') - self.batchnorm_proj_2 = dg.BatchNorm(projection_size, - data_layout='NCHW') - self.max_pool = Pool1D(pool_size = max_pool_kernel_size, - pool_type='max', - pool_stride=1, - pool_padding=1, - data_format = "NCT") + self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW') + self.batchnorm_proj_2 = dg.BatchNorm( + projection_size, data_layout='NCHW') + self.max_pool = Pool1D( + pool_size=max_pool_kernel_size, + pool_type='max', + pool_stride=1, + pool_padding=1, + data_format="NCT") self.highway = Highwaynet(self.projection_size) h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = dg.to_variable(h_0) k = math.sqrt(1 / hidden_size) - self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, - is_reverse = False, - origin_mode = True, - h_0 = h_0) - self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0 = h_0) + self.fc_forward1 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.fc_reverse1 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.gru_forward1 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=False, + origin_mode=True, + h_0=h_0) + self.gru_reverse1 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=True, + origin_mode=True, + h_0=h_0) - self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, - is_reverse = False, - origin_mode = True, - h_0 = h_0) - self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, - is_reverse=True, - origin_mode=True, - h_0 = h_0) + self.fc_forward2 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.fc_reverse2 = dg.Linear( + hidden_size, + hidden_size // 2 * 3, + param_attr=fluid.ParamAttr( + 
initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.gru_forward2 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=False, + origin_mode=True, + h_0=h_0) + self.gru_reverse2 = DynamicGRU( + size=self.hidden_size // 2, + is_reverse=True, + origin_mode=True, + h_0=h_0) def _conv_fit_dim(self, x, filter_size=3): if filter_size % 2 == 0: - return x[:,:,:-1] + return x[:, :, :-1] else: - return x + return x def forward(self, input_): # input_.shape = [N, C, T] conv_list = [] conv_input = input_ - - for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)): - conv_input = self._conv_fit_dim(conv(conv_input), i+1) + + for i, (conv, batchnorm + ) in enumerate(zip(self.conv_list, self.batchnorm_list)): + conv_input = self._conv_fit_dim(conv(conv_input), i + 1) conv_input = layers.relu(batchnorm(conv_input)) conv_list.append(conv_input) - + conv_cat = layers.concat(conv_list, axis=1) - conv_pool = self.max_pool(conv_cat)[:,:,:-1] - - - conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool)))) - conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ - + conv_pool = self.max_pool(conv_cat)[:, :, :-1] + + conv_proj = layers.relu( + self.batchnorm_proj_1( + self._conv_fit_dim(self.conv_projection_1(conv_pool)))) + conv_proj = self.batchnorm_proj_2( + self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ + # conv_proj.shape = [N, C, T] - highway = layers.transpose(conv_proj, [0,2,1]) + highway = layers.transpose(conv_proj, [0, 2, 1]) highway = self.highway(highway) # highway.shape = [N, T, C] @@ -151,9 +211,10 @@ class CBHG(dg.Layer): out_forward = self.gru_forward2(fc_forward) out_reverse = self.gru_reverse2(fc_reverse) out = layers.concat([out_forward, out_reverse], axis=-1) - out = layers.transpose(out, [0,2,1]) + out = layers.transpose(out, [0, 2, 1]) return out + class Highwaynet(dg.Layer): def __init__(self, num_units, num_layers=4): super(Highwaynet, self).__init__() @@ -164,14 +225,26 @@ class Highwaynet(dg.Layer): self.linears = [] k = math.sqrt(1 / num_units) for i in range(num_layers): - self.linears.append(dg.Linear(num_units, num_units, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))) - self.gates.append(dg.Linear(num_units, num_units, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))) - - for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): + self.linears.append( + dg.Linear( + num_units, + num_units, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) + self.gates.append( + dg.Linear( + num_units, + num_units, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)))) + + for i, (linear, gate) in enumerate(zip(self.linears, self.gates)): self.add_sublayer("linears_{}".format(i), linear) self.add_sublayer("gates_{}".format(i), gate) @@ -183,12 +256,6 @@ class Highwaynet(dg.Layer): t_ = fluid.layers.sigmoid(gate(out)) c = 1 - t_ - out = h * t_ + out * c - + out = h * t_ + out * c + return out - 
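`Highwaynet` above computes `out = h * t + x * (1 - t)` per layer, with `h = relu(linear(x))` and a sigmoid transform gate `t`. A single highway layer in NumPy (the weights are random stand-ins; the negative gate bias is a common initialization choice, not something this diff sets):

```python
import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def highway_layer(x, W_h, b_h, W_t, b_t):
    # transform gate t blends the nonlinear transform h with the
    # untouched input: out = h * t + x * (1 - t)
    h = relu(x @ W_h + b_h)
    t = sigmoid(x @ W_t + b_t)
    return h * t + x * (1.0 - t)

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 4))              # (batch, num_units)
W_h, W_t = rng.standard_normal((2, 4, 4))
b_h, b_t = np.zeros(4), np.zeros(4) - 1.0    # bias < 0 favors carrying x through
print(highway_layer(x, W_h, b_h, W_t, b_t).shape)  # (2, 4)
```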
- - - - - diff --git a/parakeet/models/transformer_tts/decoder.py b/parakeet/models/transformer_tts/decoder.py index b0da788..3d7adf1 100644 --- a/parakeet/models/transformer_tts/decoder.py +++ b/parakeet/models/transformer_tts/decoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid @@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.models.transformer_tts.prenet import PreNet from parakeet.models.transformer_tts.post_convnet import PostConvNet + class Decoder(dg.Layer): def __init__(self, num_hidden, config, num_head=4): super(Decoder, self).__init__() self.num_hidden = num_hidden param = fluid.ParamAttr() - self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', - default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) - self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'], - hidden_size = num_hidden * 2, - output_size = num_hidden, - dropout_rate=0.2) + self.alpha = self.create_parameter( + shape=(1, ), + attr=param, + dtype='float32', + default_initializer=fluid.initializer.ConstantInitializer( + value=1.0)) + self.pos_inp = get_sinusoid_encoding_table( + 1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding( + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.decoder_prenet = PreNet( + input_size=config['audio']['num_mels'], + hidden_size=num_hidden * 2, + output_size=num_hidden, + dropout_rate=0.2) k = math.sqrt(1 / num_hidden) - self.linear = dg.Linear(num_hidden, num_hidden, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.linear = dg.Linear( + num_hidden, + num_hidden, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) - self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] + self.selfattn_layers = [ + MultiheadAttention(num_hidden, num_hidden // num_head, + num_hidden // num_head) for _ in range(3) + ] for i, layer in enumerate(self.selfattn_layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] + self.attn_layers = [ + MultiheadAttention(num_hidden, num_hidden // num_head, + num_hidden // num_head) for _ in 
range(3) + ] for i, layer in enumerate(self.attn_layers): self.add_sublayer("attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] + self.ffns = [ + PositionwiseFeedForward( + num_hidden, num_hidden * num_head, filter_size=1) + for _ in range(3) + ] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'], - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) - self.stop_linear = dg.Linear(num_hidden, 1, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.mel_linear = dg.Linear( + num_hidden, + config['audio']['num_mels'] * config['audio']['outputs_per_step'], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) + self.stop_linear = dg.Linear( + num_hidden, + 1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) - self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'], - filter_size = 5, padding = 4, num_conv=5, - outputs_per_step=config['audio']['outputs_per_step'], - use_cudnn = True) + self.postconvnet = PostConvNet( + config['audio']['num_mels'], + config['hidden_size'], + filter_size=5, + padding=4, + num_conv=5, + outputs_per_step=config['audio']['outputs_per_step'], + use_cudnn=True) def forward(self, key, value, query, c_mask, positional): # get decoder mask with triangular matrix - + if fluid.framework._dygraph_tracer()._train_mode: m_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query) - triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) + mask = get_attn_key_pad_mask((positional == 0).astype(np.float32), + query) + triu_tensor = dg.to_variable( + get_triu_tensor(query.numpy(), query.numpy())).astype( + np.float32) mask = mask + triu_tensor mask = fluid.layers.cast(mask == 0, np.float32) - + # (batch_size, decoder_len, encoder_len) - zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) + zero_mask = get_attn_key_pad_mask( + layers.squeeze(c_mask, [-1]), query) else: - mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) + mask = get_triu_tensor(query.numpy(), + query.numpy()).astype(np.float32) mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) m_mask, zero_mask = None, None # Decoder pre-network query = self.decoder_prenet(query) - + # Centered position query = self.linear(query) @@ -84,10 +137,13 @@ class Decoder(dg.Layer): # Attention decoder-decoder, encoder-decoder selfattn_list = list() attn_list = list() - - for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): - query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) - query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) + + for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, + self.ffns): + query, attn_dec = selfattn( + query, query, query, mask=mask, query_mask=m_mask) 
+ query, attn_dot = attn( + key, value, query, mask=zero_mask, query_mask=m_mask) query = ffn(query) selfattn_list.append(attn_dec) attn_list.append(attn_dot) @@ -96,7 +152,7 @@ class Decoder(dg.Layer): # Post Mel Network out = self.postconvnet(mel_out) out = mel_out + out - + # Stop tokens stop_tokens = self.stop_linear(query) stop_tokens = layers.squeeze(stop_tokens, [-1]) diff --git a/parakeet/models/transformer_tts/encoder.py b/parakeet/models/transformer_tts/encoder.py index 8cd37b2..548ea8e 100644 --- a/parakeet/models/transformer_tts/encoder.py +++ b/parakeet/models/transformer_tts/encoder.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.models.transformer_tts.utils import * @@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet + class Encoder(dg.Layer): def __init__(self, embedding_size, num_hidden, num_head=4): super(Encoder, self).__init__() self.num_hidden = num_hidden - param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) - self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') - self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) - self.pos_emb = dg.Embedding(size=[1024, num_hidden], - padding_idx=0, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), - trainable=False)) - self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, - num_hidden = num_hidden, - use_cudnn=True) - self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] + param = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + self.alpha = self.create_parameter( + shape=(1, ), attr=param, dtype='float32') + self.pos_inp = get_sinusoid_encoding_table( + 1024, self.num_hidden, padding_idx=0) + self.pos_emb = dg.Embedding( + size=[1024, num_hidden], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + self.pos_inp), + trainable=False)) + self.encoder_prenet = EncoderPrenet( + embedding_size=embedding_size, + num_hidden=num_hidden, + use_cudnn=True) + self.layers = [ + MultiheadAttention(num_hidden, num_hidden // num_head, + num_hidden // num_head) for _ in range(3) + ] for i, layer in enumerate(self.layers): self.add_sublayer("self_attn_{}".format(i), layer) - self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)] + self.ffns = [ + PositionwiseFeedForward( + num_hidden, + num_hidden * num_head, + filter_size=1, + use_cudnn=True) for _ in range(3) + ] for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) @@ -33,25 +62,23 @@ class Encoder(dg.Layer): mask = 
get_attn_key_pad_mask(positional, x) else: query_mask, mask = None, None - + # Encoder pre_network - x = self.encoder_prenet(x) #(N,T,C) - - + x = self.encoder_prenet(x) #(N,T,C) + # Get positional encoding - positional = self.pos_emb(positional) - - x = positional * self.alpha + x #(N, T, C) - + positional = self.pos_emb(positional) + + x = positional * self.alpha + x #(N, T, C) # Positional dropout x = layers.dropout(x, 0.1) - + # Self attention encoder attentions = list() for layer, ffn in zip(self.layers, self.ffns): - x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) + x, attention = layer(x, x, x, mask=mask, query_mask=query_mask) x = ffn(x) attentions.append(attention) - return x, query_mask, attentions \ No newline at end of file + return x, query_mask, attentions diff --git a/parakeet/models/transformer_tts/encoderprenet.py b/parakeet/models/transformer_tts/encoderprenet.py index b27f2fe..d701424 100644 --- a/parakeet/models/transformer_tts/encoderprenet.py +++ b/parakeet/models/transformer_tts/encoderprenet.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math from parakeet.g2p.text.symbols import symbols import paddle.fluid.dygraph as dg @@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer): self.embedding_size = embedding_size self.num_hidden = num_hidden self.use_cudnn = use_cudnn - self.embedding = dg.Embedding( size = [len(symbols), embedding_size], - padding_idx = None) + self.embedding = dg.Embedding( + size=[len(symbols), embedding_size], padding_idx=None) self.conv_list = [] k = math.sqrt(1 / embedding_size) - self.conv_list.append(Conv1D(num_channels = embedding_size, - num_filters = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=embedding_size, + num_filters=num_hidden, + filter_size=5, + padding=int(np.floor(5 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) k = math.sqrt(1 / num_hidden) for _ in range(2): - self.conv_list.append(Conv1D(num_channels = num_hidden, - num_filters = num_hidden, - filter_size = 5, - padding = int(np.floor(5/2)), - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=num_hidden, + num_filters=num_hidden, + filter_size=5, + padding=int(np.floor(5 / 2)), + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) 
         for i, layer in enumerate(self.conv_list):
             self.add_sublayer("conv_list_{}".format(i), layer)

-        self.batch_norm_list = [dg.BatchNorm(num_hidden,
-            data_layout='NCHW') for _ in range(3)]
+        self.batch_norm_list = [
+            dg.BatchNorm(
+                num_hidden, data_layout='NCHW') for _ in range(3)
+        ]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)

         k = math.sqrt(1 / num_hidden)
-        self.projection = dg.Linear(num_hidden, num_hidden,
-            param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
-            bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+        self.projection = dg.Linear(
+            num_hidden,
+            num_hidden,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))

     def forward(self, x):
-        x = self.embedding(x) #(batch_size, seq_len, embending_size)
-        x = layers.transpose(x,[0,2,1])
+        x = self.embedding(x)  #(batch_size, seq_len, embedding_size)
+        x = layers.transpose(x, [0, 2, 1])
         for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
             x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
-        x = layers.transpose(x,[0,2,1]) #(N,T,C)
+        x = layers.transpose(x, [0, 2, 1])  #(N,T,C)
         x = self.projection(x)
-        return x
\ No newline at end of file
+        return x
diff --git a/parakeet/models/transformer_tts/post_convnet.py b/parakeet/models/transformer_tts/post_convnet.py
index 3e393ee..8882e79 100644
--- a/parakeet/models/transformer_tts/post_convnet.py
+++ b/parakeet/models/transformer_tts/post_convnet.py
@@ -1,11 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
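An editor's aside before the `post_convnet.py` body: the `EncoderPrenet` hunks above only re-indent the code, so it is worth restating the shape flow they preserve. A minimal NumPy sketch (sizes invented for illustration; this is not the fluid API):

```python
import numpy as np

# EncoderPrenet shape flow (sizes invented for illustration):
# the embedding yields (N, T, C); Conv1D expects channels first, so the
# data is transposed to (N, C, T); three (conv -> batch norm -> relu ->
# dropout) blocks keep T unchanged because filter_size=5 is paired with
# padding=floor(5/2)=2; then a transpose back to (N, T, C) precedes the
# final Linear projection.
N, T, C = 2, 10, 256
x = np.random.randn(N, T, C)   # stands in for self.embedding(character_ids)
x = x.transpose(0, 2, 1)       # (N, C, T) for the conv stack
# ... conv/bn/relu/dropout blocks would act here without changing the shape ...
x = x.transpose(0, 2, 1)       # (N, T, C) again
assert x.shape == (N, T, C)    # ready for self.projection
```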
import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers from parakeet.modules.customized import Conv1D + class PostConvNet(dg.Layer): - def __init__(self, + def __init__(self, n_mels=80, num_hidden=512, filter_size=5, @@ -16,49 +30,66 @@ class PostConvNet(dg.Layer): dropout=0.1, batchnorm_last=False): super(PostConvNet, self).__init__() - + self.dropout = dropout self.num_conv = num_conv self.batchnorm_last = batchnorm_last self.conv_list = [] k = math.sqrt(1 / (n_mels * outputs_per_step)) - self.conv_list.append(Conv1D(num_channels = n_mels * outputs_per_step, - num_filters = num_hidden, - filter_size = filter_size, - padding = padding, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=n_mels * outputs_per_step, + num_filters=num_hidden, + filter_size=filter_size, + padding=padding, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) k = math.sqrt(1 / num_hidden) - for _ in range(1, num_conv-1): - self.conv_list.append(Conv1D(num_channels = num_hidden, - num_filters = num_hidden, - filter_size = filter_size, - padding = padding, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + for _ in range(1, num_conv - 1): + self.conv_list.append( + Conv1D( + num_channels=num_hidden, + num_filters=num_hidden, + filter_size=filter_size, + padding=padding, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) - self.conv_list.append(Conv1D(num_channels = num_hidden, - num_filters = n_mels * outputs_per_step, - filter_size = filter_size, - padding = padding, - param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn)) + self.conv_list.append( + Conv1D( + num_channels=num_hidden, + num_filters=n_mels * outputs_per_step, + filter_size=filter_size, + padding=padding, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-k, high=k)), + use_cudnn=use_cudnn)) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) - self.batch_norm_list = [dg.BatchNorm(num_hidden, - data_layout='NCHW') for _ in range(num_conv-1)] + self.batch_norm_list = [ + dg.BatchNorm( + num_hidden, data_layout='NCHW') for _ in range(num_conv - 1) + ] if self.batchnorm_last: - self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, - data_layout='NCHW')) + self.batch_norm_list.append( + dg.BatchNorm( + n_mels * outputs_per_step, data_layout='NCHW')) for i, layer in enumerate(self.batch_norm_list): self.add_sublayer("batch_norm_list_{}".format(i), layer) - def forward(self, input): """ @@ -69,18 +100,19 @@ class PostConvNet(dg.Layer): Returns: output (Variable), Shape(B, T, C), the result after postconvnet. 
""" - - input = layers.transpose(input, [0,2,1]) + + input = layers.transpose(input, [0, 2, 1]) len = input.shape[-1] - for i in range(self.num_conv-1): + for i in range(self.num_conv - 1): batch_norm = self.batch_norm_list[i] conv = self.conv_list[i] - - input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout) - conv = self.conv_list[self.num_conv-1] - input = conv(input)[:,:,:len] + + input = layers.dropout( + layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout) + conv = self.conv_list[self.num_conv - 1] + input = conv(input)[:, :, :len] if self.batchnorm_last: - batch_norm = self.batch_norm_list[self.num_conv-1] + batch_norm = self.batch_norm_list[self.num_conv - 1] input = layers.dropout(batch_norm(input), self.dropout) - output = layers.transpose(input, [0,2,1]) - return output \ No newline at end of file + output = layers.transpose(input, [0, 2, 1]) + return output diff --git a/parakeet/models/transformer_tts/prenet.py b/parakeet/models/transformer_tts/prenet.py index e9b0667..6039b60 100644 --- a/parakeet/models/transformer_tts/prenet.py +++ b/parakeet/models/transformer_tts/prenet.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid import paddle.fluid.layers as layers + class PreNet(dg.Layer): def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): """ @@ -17,13 +31,21 @@ class PreNet(dg.Layer): self.dropout_rate = dropout_rate k = math.sqrt(1 / input_size) - self.linear1 = dg.Linear(input_size, hidden_size, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.linear1 = dg.Linear( + input_size, + hidden_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) k = math.sqrt(1 / hidden_size) - self.linear2 = dg.Linear(hidden_size, output_size, - param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), - bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) + self.linear2 = dg.Linear( + hidden_size, + output_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.XavierInitializer()), + bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( + low=-k, high=k))) def forward(self, x): """ diff --git a/parakeet/models/transformer_tts/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py index bf2924a..1205c6b 100644 --- a/parakeet/models/transformer_tts/transformer_tts.py +++ b/parakeet/models/transformer_tts/transformer_tts.py @@ -1,8 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.models.transformer_tts.encoder import Encoder
 from parakeet.models.transformer_tts.decoder import Decoder

+
 class TransformerTTS(dg.Layer):
     def __init__(self, config):
         super(TransformerTTS, self).__init__()
@@ -11,16 +25,10 @@ class TransformerTTS(dg.Layer):
         self.config = config

     def forward(self, characters, mel_input, pos_text, pos_mel):
-
+
         key, c_mask, attns_enc = self.encoder(characters, pos_text)
-
-        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
+
+        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
+            key, key, mel_input, c_mask, pos_mel)

         return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
-
-
-
-
-
-
-
diff --git a/parakeet/models/transformer_tts/utils.py b/parakeet/models/transformer_tts/utils.py
index ab575f9..2212744 100644
--- a/parakeet/models/transformer_tts/utils.py
+++ b/parakeet/models/transformer_tts/utils.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
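The `Encoder` shown earlier registers a fixed sinusoid table as a non-trainable `Embedding`; the generating function, `get_sinusoid_encoding_table`, appears in the `utils.py` hunk just below. As a sanity check, an equivalent standalone NumPy computation (our reading of the code, not part of the patch):

```python
import numpy as np


def sinusoid_table(n_position, d_hid, padding_idx=None):
    # angle(pos, j) = pos / 10000 ** (2 * (j // 2) / d_hid);
    # even dimensions get sin, odd dimensions get cos.
    pos = np.arange(n_position)[:, None].astype(np.float64)
    j = np.arange(d_hid)[None, :]
    table = pos / np.power(10000.0, 2 * (j // 2) / d_hid)
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    if padding_idx is not None:
        table[padding_idx] = 0.0  # reserve the padding position
    return table


# matches the (1024, num_hidden) table the Encoder feeds to dg.Embedding
print(sinusoid_table(1024, 256, padding_idx=0).shape)  # (1024, 256)
```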
import numpy as np import librosa import os, copy @@ -6,14 +19,15 @@ import paddle.fluid.layers as layers def get_positional_table(d_pos_vec, n_position=1024): - position_enc = np.array([ - [pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)] - if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc = np.array( + [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)] + if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) - position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i - position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 return position_enc + def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): ''' Sinusoid position encoding table ''' @@ -23,7 +37,8 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): def get_posi_angle_vec(position): return [cal_angle(position, hid_j) for hid_j in range(d_hid)] - sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table = np.array( + [get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 @@ -34,8 +49,10 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): return sinusoid_table + def get_non_pad_mask(seq): - return layers.unsqueeze((seq != 0).astype(np.float32),[-1]) + return layers.unsqueeze((seq != 0).astype(np.float32), [-1]) + def get_attn_key_pad_mask(seq_k, seq_q): ''' For masking out the padding part of key sequence. ''' @@ -43,32 +60,37 @@ def get_attn_key_pad_mask(seq_k, seq_q): # Expand to fit the shape of key query attention matrix. len_q = seq_q.shape[1] padding_mask = (seq_k != 0).astype(np.float32) - padding_mask = layers.expand(layers.unsqueeze(padding_mask,[1]), [1, len_q, 1]) + padding_mask = layers.expand( + layers.unsqueeze(padding_mask, [1]), [1, len_q, 1]) return padding_mask + def get_triu_tensor(seq_k, seq_q): ''' For make a triu tensor ''' len_k = seq_k.shape[1] len_q = seq_q.shape[1] batch_size = seq_k.shape[0] triu_tensor = np.triu(np.ones([len_k, len_q]), 1) - triu_tensor = np.repeat(np.expand_dims(triu_tensor, axis=0) ,batch_size, axis=0) - + triu_tensor = np.repeat( + np.expand_dims( + triu_tensor, axis=0), batch_size, axis=0) + return triu_tensor + def guided_attention(N, T, g=0.2): '''Guided attention. 
Refer to page 3 on the paper.''' W = np.zeros((N, T), dtype=np.float32) for n_pos in range(W.shape[0]): for t_pos in range(W.shape[1]): - W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g)) + W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) + **2 / (2 * g * g)) return W def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30): - output = -1 * label * layers.log(input + epsilon) - (1-label) * layers.log(1 - input + epsilon) + output = -1 * label * layers.log(input + epsilon) - ( + 1 - label) * layers.log(1 - input + epsilon) output = output * (label * (position_weight - 1) + 1) return layers.reduce_sum(output, dim=[0, 1]) - - diff --git a/parakeet/models/transformer_tts/vocoder.py b/parakeet/models/transformer_tts/vocoder.py index 3fa19a6..33ffe1c 100644 --- a/parakeet/models/transformer_tts/vocoder.py +++ b/parakeet/models/transformer_tts/vocoder.py @@ -1,27 +1,44 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.modules.customized import Conv1D from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.cbhg import CBHG + class Vocoder(dg.Layer): """ CBHG Network (mel -> linear) """ + def __init__(self, config, batch_size): super(Vocoder, self).__init__() - self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'], - num_filters = config['hidden_size'], - filter_size=1) + self.pre_proj = Conv1D( + num_channels=config['audio']['num_mels'], + num_filters=config['hidden_size'], + filter_size=1) self.cbhg = CBHG(config['hidden_size'], batch_size) - self.post_proj = Conv1D(num_channels = config['hidden_size'], - num_filters = (config['audio']['n_fft'] // 2) + 1, - filter_size=1) + self.post_proj = Conv1D( + num_channels=config['hidden_size'], + num_filters=(config['audio']['n_fft'] // 2) + 1, + filter_size=1) def forward(self, mel): - mel = layers.transpose(mel, [0,2,1]) + mel = layers.transpose(mel, [0, 2, 1]) mel = self.pre_proj(mel) mel = self.cbhg(mel) mag_pred = self.post_proj(mel) - mag_pred = layers.transpose(mag_pred, [0,2,1]) + mag_pred = layers.transpose(mag_pred, [0, 2, 1]) return mag_pred diff --git a/parakeet/models/waveflow/__init__.py b/parakeet/models/waveflow/__init__.py index 20475cd..73a7914 100644 --- a/parakeet/models/waveflow/__init__.py +++ b/parakeet/models/waveflow/__init__.py @@ -1 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from parakeet.models.waveflow.waveflow import WaveFlow diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py index b5ad2c9..0c1e914 100644 --- a/parakeet/models/waveflow/data.py +++ b/parakeet/models/waveflow/data.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import random import librosa diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py index 1b1b8bf..a8bd8af 100644 --- a/parakeet/models/waveflow/waveflow.py +++ b/parakeet/models/waveflow/waveflow.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os import time diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index 1b8938a..e5b9a3e 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import numpy as np import paddle.fluid.dygraph as dg diff --git a/parakeet/models/wavenet/README.md b/parakeet/models/wavenet/README.md index 18efd0b..21a0f92 100644 --- a/parakeet/models/wavenet/README.md +++ b/parakeet/models/wavenet/README.md @@ -2,7 +2,7 @@ Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms. WaveNet model is originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499). 
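For orientation before the WaveNet hunks (our summary, not text from the README): WaveNet is autoregressive, factorizing the joint probability of waveform samples conditioned on acoustic features, and the code this patch reformats implements that factorization:

```latex
p(\mathbf{x} \mid \mathbf{c}) = \prod_{t=1}^{T} p(x_t \mid x_1, \ldots, x_{t-1}, \mathbf{c})
```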
-Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels. 
+Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.

 We implement the WaveNet model in paddle fluid with dynamic graph, which is convenient for flexible network architectures.

@@ -51,10 +51,10 @@ python -u train.py --config=${yaml} \

 #### Save and Load checkpoints

 Our model will save model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default.
-The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters. 
+The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.

 There are three ways to load a checkpoint and resume training (suppose, for example, that you want to load the checkpoint from iteration 500000):

-1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`, no extension name `.pdparams` or `.pdopt` is needed. 
+1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`; no extension `.pdparams` or `.pdopt` is needed.
 2. Use `--iteration=500000`.
 3. If you don't specify either `--checkpoint` or `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`.

@@ -91,7 +91,7 @@ python -u synthesis.py --config=${yaml} \
     --root=./data/LJSpeech-1.1 \
     --name=${ModelName} --use_gpu=true \
     --output=./syn_audios \
-    --sample=${SAMPLE} 
+    --sample=${SAMPLE}
 ```

 In this example, `--output` specifies where to save the synthesized audios, and `--sample` specifies which sample in the valid dataset (a split of the whole LJSpeech dataset, which by default contains the first 16 audio samples) to synthesize, based on the mel-spectrograms computed from the ground-truth sample audio; e.g., `--sample=0` means to synthesize the first audio in the valid dataset.
diff --git a/parakeet/models/wavenet/data.py b/parakeet/models/wavenet/data.py
index a4f1b70..db19667 100644
--- a/parakeet/models/wavenet/data.py
+++ b/parakeet/models/wavenet/data.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
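Returning to the checkpoint convention documented in the README above, a small sketch of how a base name maps to files (the model name is hypothetical; per the README, only the base name is passed on the command line and the extensions are appended by the loader):

```python
import os


def checkpoint_base(model_name, iteration, root="./runs/wavenet"):
    # e.g. ./runs/wavenet/<ModelName>/checkpoint/step-500000
    return os.path.join(root, model_name, "checkpoint",
                        "step-{}".format(iteration))


base = checkpoint_base("my_model", 500000)  # "my_model" is hypothetical
params_file = base + ".pdparams"  # model parameters
optimizer_file = base + ".pdopt"  # optimizer state
print(base)
```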
+ import random import librosa @@ -18,8 +32,8 @@ class Dataset(ljspeech.LJSpeech): self.fft_window_shift = config.fft_window_shift # Calculate context frames. frames_per_second = config.sample_rate // self.fft_window_shift - train_clip_frames = int(np.ceil( - config.train_clip_second * frames_per_second)) + train_clip_frames = int( + np.ceil(config.train_clip_second * frames_per_second)) context_frames = config.context_size // self.fft_window_shift self.num_frames = train_clip_frames + context_frames @@ -32,7 +46,7 @@ class Dataset(ljspeech.LJSpeech): fft_window_shift = config.fft_window_shift fft_window_size = config.fft_window_size fft_size = config.fft_size - + audio, loaded_sr = librosa.load(wav_path, sr=None) assert loaded_sr == sr @@ -41,42 +55,46 @@ class Dataset(ljspeech.LJSpeech): fft_padding = (fft_size - fft_window_shift) // 2 desired_length = frames * fft_window_shift + fft_padding * 2 pad_amount = (desired_length - audio.size) // 2 - + if audio.size % 2 == 0: audio = np.pad(audio, (pad_amount, pad_amount), mode='reflect') else: audio = np.pad(audio, (pad_amount, pad_amount + 1), mode='reflect') - + # Normalize audio. audio = audio / np.abs(audio).max() * 0.999 - + # Compute mel-spectrogram. # Turn center to False to prevent internal padding. spectrogram = librosa.core.stft( - audio, hop_length=fft_window_shift, - win_length=fft_window_size, n_fft=fft_size, center=False) + audio, + hop_length=fft_window_shift, + win_length=fft_window_size, + n_fft=fft_size, + center=False) spectrogram_magnitude = np.abs(spectrogram) - + # Compute mel-spectrograms. - mel_filter_bank = librosa.filters.mel(sr=sr, n_fft=fft_size, + mel_filter_bank = librosa.filters.mel(sr=sr, + n_fft=fft_size, n_mels=config.mel_bands) mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude) mel_spectrogram = mel_spectrogram.T - + # Rescale mel_spectrogram. min_level, ref_level = 1e-5, 20 mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram)) mel_spectrogram = mel_spectrogram - ref_level mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1) - + # Extract the center of audio that corresponds to mel spectrograms. - audio = audio[fft_padding : -fft_padding] + audio = audio[fft_padding:-fft_padding] assert mel_spectrogram.shape[0] * fft_window_shift == audio.size return audio, mel_spectrogram -class Subset(dataset.Dataset): +class Subset(dataset.Dataset): def __init__(self, dataset, indices, valid): self.dataset = dataset self.indices = indices @@ -100,23 +118,23 @@ class Subset(dataset.Dataset): audio_start = frame_start * fft_window_shift audio_end = frame_end * fft_window_shift - - audio = audio[audio_start : audio_end] + + audio = audio[audio_start:audio_end] return audio, mel, audio_start def _batch_examples(self, batch): audios = [sample[0] for sample in batch] audio_starts = [sample[2] for sample in batch] - + # mels shape [num_frames, mel_bands] - max_frames = max(sample[1].shape[0] for sample in batch) + max_frames = max(sample[1].shape[0] for sample in batch) mels = [utils.pad_to_size(sample[1], max_frames) for sample in batch] - + audios = np.array(audios, dtype=np.float32) mels = np.array(mels, dtype=np.float32) audio_starts = np.array(audio_starts, dtype=np.int32) - + return audios, mels, audio_starts def __len__(self): @@ -138,17 +156,17 @@ class LJSpeech: # Train dataset. 
trainset = Subset(ds, train_indices, valid=False) - sampler = DistributedSampler(len(trainset), nranks, rank) + sampler = DistributedSampler(len(trainset), nranks, rank) total_bs = config.batch_size assert total_bs % nranks == 0 - train_sampler = BatchSampler(sampler, total_bs // nranks, - drop_last=True) + train_sampler = BatchSampler( + sampler, total_bs // nranks, drop_last=True) trainloader = DataCargo(trainset, batch_sampler=train_sampler) trainreader = fluid.io.PyReader(capacity=50, return_list=True) trainreader.decorate_batch_generator(trainloader, place) self.trainloader = (data for _ in iter(int, 1) - for data in trainreader()) + for data in trainreader()) # Valid dataset. validset = Subset(ds, valid_indices, valid=True) @@ -156,5 +174,5 @@ class LJSpeech: validloader = DataCargo(validset, batch_size=1, shuffle=False) validreader = fluid.io.PyReader(capacity=20, return_list=True) - validreader.decorate_batch_generator(validloader, place) + validreader.decorate_batch_generator(validloader, place) self.validloader = validreader diff --git a/parakeet/models/wavenet/slurm.py b/parakeet/models/wavenet/slurm.py index 47af2dc..dfd22e4 100644 --- a/parakeet/models/wavenet/slurm.py +++ b/parakeet/models/wavenet/slurm.py @@ -1,3 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Utility module for restarting training when using SLURM. """ @@ -45,8 +58,8 @@ def parse_time(text): try: return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds) except ValueError as e: - raise ValueError("Error parsing time {}. Got error {}.".format( - text, str(e))) + raise ValueError("Error parsing time {}. 
Got error {}.".format(text, + str(e))) def restart_command(): @@ -76,8 +89,10 @@ def restart_command(): gres, partition = info.get("Gres"), info.get("Partition") stderr, stdout = info.get("StdErr"), info.get("StdOut") job_name = info.get("JobName") - command = ["sbatch", "--job-name={}".format(job_name), - "--ntasks={}".format(num_tasks)] + command = [ + "sbatch", "--job-name={}".format(job_name), + "--ntasks={}".format(num_tasks) + ] if partition: command.extend(["--partition", partition]) @@ -98,12 +113,13 @@ def restart_command(): dist_setting = ['-m', 'paddle.distributed.launch'] wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv - command.append( - "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd))) + command.append("--wrap={}".format(" ".join( + shlex.quote(arg) for arg in wrap_cmd))) time_limit_string = info["TimeLimit"] if time_limit_string.lower() == "unlimited": - print("UNLIMITED detected: restart OFF, infinite learning ON.", - flush=True) + print( + "UNLIMITED detected: restart OFF, infinite learning ON.", + flush=True) return command, None time_limit = parse_time(time_limit_string) runtime = parse_time(info["RunTime"]) diff --git a/parakeet/models/wavenet/synthesis.py b/parakeet/models/wavenet/synthesis.py index d87a188..43d78de 100644 --- a/parakeet/models/wavenet/synthesis.py +++ b/parakeet/models/wavenet/synthesis.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
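A condensed, hedged sketch of the `--wrap` assembly in `restart_command` above (the job name and task count are placeholders; the quoting logic mirrors the patch):

```python
import shlex
import sys

# Re-submit the current script under srun inside a new sbatch job.
# shlex.quote protects every argument so the shell that interprets
# --wrap sees each one verbatim.
python = sys.executable
wrap_cmd = ["srun", python, "-u", "-m", "paddle.distributed.launch"] + sys.argv
command = [
    "sbatch",
    "--job-name={}".format("demo_job"),  # placeholder job name
    "--ntasks={}".format(8),             # placeholder task count
    "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd)),
]
print(command[-1])
```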
+ import os import random from pprint import pprint @@ -12,25 +26,42 @@ from wavenet import WaveNet def add_options_to_parser(parser): - parser.add_argument('--model', type=str, default='wavenet', + parser.add_argument( + '--model', + type=str, + default='wavenet', help="general name of the model") - parser.add_argument('--name', type=str, - help="specific name of the training model") - parser.add_argument('--root', type=str, - help="root path of the LJSpeech dataset") + parser.add_argument( + '--name', type=str, help="specific name of the training model") + parser.add_argument( + '--root', type=str, help="root path of the LJSpeech dataset") - parser.add_argument('--use_gpu', type=bool, default=True, + parser.add_argument( + '--use_gpu', + type=bool, + default=True, help="option to use gpu training") - parser.add_argument('--iteration', type=int, default=None, + parser.add_argument( + '--iteration', + type=int, + default=None, help=("which iteration of checkpoint to load, " "default to load the latest checkpoint")) - parser.add_argument('--checkpoint', type=str, default=None, + parser.add_argument( + '--checkpoint', + type=str, + default=None, help="path of the checkpoint to load") - parser.add_argument('--output', type=str, default="./syn_audios", + parser.add_argument( + '--output', + type=str, + default="./syn_audios", help="path to write synthesized audio files") - parser.add_argument('--sample', type=int, + parser.add_argument( + '--sample', + type=int, help="which of the valid samples to synthesize audio") @@ -52,7 +83,7 @@ def synthesize(config): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed print("Random Seed: ", seed) - + # Build model. model = WaveNet(config, checkpoint_dir) model.build(training=False) diff --git a/parakeet/models/wavenet/train.py b/parakeet/models/wavenet/train.py index 1a17bbd..7ebf58d 100644 --- a/parakeet/models/wavenet/train.py +++ b/parakeet/models/wavenet/train.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
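One caveat about the boolean options above, unchanged by this patch: when such flags are parsed with the standard library's `argparse`, `type=bool` converts any non-empty string (including `"False"`) to `True`, so `--use_gpu=False` would still enable the GPU. A common defensive pattern (the converter is our sketch, not code from the repository):

```python
import argparse


def str2bool(v):
    # argparse's type=bool treats any non-empty string (even "False")
    # as True; an explicit converter avoids that foot-gun.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % v)


parser = argparse.ArgumentParser()
parser.add_argument("--use_gpu", type=str2bool, default=True)
print(parser.parse_args(["--use_gpu=False"]).use_gpu)  # False
```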
+ import os import random import subprocess @@ -18,24 +32,42 @@ MAXIMUM_SAVE_TIME = 10 * 60 def add_options_to_parser(parser): - parser.add_argument('--model', type=str, default='wavenet', + parser.add_argument( + '--model', + type=str, + default='wavenet', help="general name of the model") - parser.add_argument('--name', type=str, - help="specific name of the training model") - parser.add_argument('--root', type=str, - help="root path of the LJSpeech dataset") + parser.add_argument( + '--name', type=str, help="specific name of the training model") + parser.add_argument( + '--root', type=str, help="root path of the LJSpeech dataset") - parser.add_argument('--parallel', type=bool, default=True, + parser.add_argument( + '--parallel', + type=bool, + default=True, help="option to use data parallel training") - parser.add_argument('--use_gpu', type=bool, default=True, + parser.add_argument( + '--use_gpu', + type=bool, + default=True, help="option to use gpu training") - parser.add_argument('--iteration', type=int, default=None, + parser.add_argument( + '--iteration', + type=int, + default=None, help=("which iteration of checkpoint to load, " "default to load the latest checkpoint")) - parser.add_argument('--checkpoint', type=str, default=None, + parser.add_argument( + '--checkpoint', + type=str, + default=None, help="path of the checkpoint to load") - parser.add_argument('--slurm', type=bool, default=False, + parser.add_argument( + '--slurm', + type=bool, + default=False, help="whether you are using slurm to submit training jobs") @@ -104,8 +136,8 @@ def train(config): # Check whether reaching the time limit. if config.slurm: - done = (death_time is not None and death_time - time.time() < - MAXIMUM_SAVE_TIME) + done = (death_time is not None and + death_time - time.time() < MAXIMUM_SAVE_TIME) if rank == 0 and done: print("Saving progress before exiting.") @@ -127,8 +159,8 @@ def train(config): if __name__ == "__main__": # Create parser. - parser = jsonargparse.ArgumentParser(description="Train WaveNet model", - formatter_class='default_argparse') + parser = jsonargparse.ArgumentParser( + description="Train WaveNet model", formatter_class='default_argparse') add_options_to_parser(parser) utils.add_config_options_to_parser(parser) @@ -136,4 +168,4 @@ if __name__ == "__main__": # For conflicting updates to the same field, # the preceding update will be overwritten by the following one. config = parser.parse_args() - train(config) + train(config) diff --git a/parakeet/models/wavenet/utils.py b/parakeet/models/wavenet/utils.py index c2b6601..bb21b93 100644 --- a/parakeet/models/wavenet/utils.py +++ b/parakeet/models/wavenet/utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
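The SLURM time-limit check in the training loop above reduces to a small predicate; a self-contained sketch (`death_time` is the wall-clock time at which SLURM will kill the job, `None` when the limit is UNLIMITED):

```python
import time

MAXIMUM_SAVE_TIME = 10 * 60  # seconds reserved for writing a checkpoint


def should_stop(death_time):
    # Stop training when less than MAXIMUM_SAVE_TIME seconds remain
    # before SLURM kills the job, so a final checkpoint can be saved.
    return (death_time is not None and
            death_time - time.time() < MAXIMUM_SAVE_TIME)


print(should_stop(time.time() + 5 * 60))  # True: 5 minutes left, save now
print(should_stop(time.time() + 3600))    # False: plenty of time left
print(should_stop(None))                  # False: no time limit (UNLIMITED)
```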
+ import itertools import os import time @@ -8,57 +22,82 @@ import paddle.fluid.dygraph as dg def add_config_options_to_parser(parser): - parser.add_argument('--valid_size', type=int, - help="size of the valid dataset") - parser.add_argument('--train_clip_second', type=float, + parser.add_argument( + '--valid_size', type=int, help="size of the valid dataset") + parser.add_argument( + '--train_clip_second', + type=float, help="the length of audio clip for training") - parser.add_argument('--sample_rate', type=int, - help="sampling rate of audio data file") - parser.add_argument('--fft_window_shift', type=int, + parser.add_argument( + '--sample_rate', type=int, help="sampling rate of audio data file") + parser.add_argument( + '--fft_window_shift', + type=int, help="the shift of fft window for each frame") - parser.add_argument('--fft_window_size', type=int, + parser.add_argument( + '--fft_window_size', + type=int, help="the size of fft window for each frame") - parser.add_argument('--fft_size', type=int, - help="the size of fft filter on each frame") - parser.add_argument('--mel_bands', type=int, + parser.add_argument( + '--fft_size', type=int, help="the size of fft filter on each frame") + parser.add_argument( + '--mel_bands', + type=int, help="the number of mel bands when calculating mel spectrograms") - parser.add_argument('--seed', type=int, - help="seed of random initialization for the model") - parser.add_argument('--batch_size', type=int, - help="batch size for training") - parser.add_argument('--test_every', type=int, - help="test interval during training") - parser.add_argument('--save_every', type=int, + parser.add_argument( + '--seed', type=int, help="seed of random initialization for the model") + parser.add_argument( + '--batch_size', type=int, help="batch size for training") + parser.add_argument( + '--test_every', type=int, help="test interval during training") + parser.add_argument( + '--save_every', + type=int, help="checkpointing interval during training") - parser.add_argument('--max_iterations', type=int, - help="maximum training iterations") + parser.add_argument( + '--max_iterations', type=int, help="maximum training iterations") - parser.add_argument('--layers', type=int, - help="number of dilated convolution layers") - parser.add_argument('--kernel_width', type=int, - help="dilated convolution kernel width") - parser.add_argument('--dilation_block', type=list, - help="dilated convolution kernel width") + parser.add_argument( + '--layers', type=int, help="number of dilated convolution layers") + parser.add_argument( + '--kernel_width', type=int, help="dilated convolution kernel width") + parser.add_argument( + '--dilation_block', type=list, help="dilated convolution kernel width") parser.add_argument('--residual_channels', type=int) parser.add_argument('--skip_channels', type=int) - parser.add_argument('--loss_type', type=str, - help="mix-gaussian-pdf or softmax") - parser.add_argument('--num_channels', type=int, default=None, + parser.add_argument( + '--loss_type', type=str, help="mix-gaussian-pdf or softmax") + parser.add_argument( + '--num_channels', + type=int, + default=None, help="number of channels for softmax output") - parser.add_argument('--num_mixtures', type=int, default=None, + parser.add_argument( + '--num_mixtures', + type=int, + default=None, help="number of gaussian mixtures for gaussian output") - parser.add_argument('--log_scale_min', type=float, default=None, + parser.add_argument( + '--log_scale_min', + type=float, + default=None, help="minimum 
clip value of log variance of gaussian output") - parser.add_argument('--conditioner.filter_sizes', type=list, + parser.add_argument( + '--conditioner.filter_sizes', + type=list, help="conv2d tranpose op filter sizes for building conditioner") - parser.add_argument('--conditioner.upsample_factors', type=list, + parser.add_argument( + '--conditioner.upsample_factors', + type=list, help="list of upsample factors for building conditioner") parser.add_argument('--learning_rate', type=float) parser.add_argument('--gradient_max_norm', type=float) - parser.add_argument('--anneal.every', type=int, + parser.add_argument( + '--anneal.every', + type=int, help="step interval for annealing learning rate") parser.add_argument('--anneal.rate', type=float) @@ -113,8 +152,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration): handle.write("model_checkpoint_path: step-{}".format(iteration)) -def load_parameters(checkpoint_dir, rank, model, optimizer=None, - iteration=None, file_path=None): +def load_parameters(checkpoint_dir, + rank, + model, + optimizer=None, + iteration=None, + file_path=None): if file_path is None: if iteration is None: iteration = load_latest_checkpoint(checkpoint_dir, rank) @@ -128,7 +171,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None, if optimizer and optimizer_dict: optimizer.set_dict(optimizer_dict) print("[checkpoint] Rank {}: loaded optimizer state from {}".format( - rank, file_path)) + rank, file_path)) def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None): diff --git a/parakeet/models/wavenet/wavenet.py b/parakeet/models/wavenet/wavenet.py index c636c4b..db7a06e 100644 --- a/parakeet/models/wavenet/wavenet.py +++ b/parakeet/models/wavenet/wavenet.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import os import time @@ -13,8 +27,13 @@ from wavenet_modules import WaveNetModule class WaveNet(): - def __init__(self, config, checkpoint_dir, parallel=False, rank=0, - nranks=1, tb_logger=None): + def __init__(self, + config, + checkpoint_dir, + parallel=False, + rank=0, + nranks=1, + tb_logger=None): # Process config to calculate the context size dilations = list( itertools.islice( @@ -29,12 +48,12 @@ class WaveNet(): def build(self, training=True): config = self.config - dataset = LJSpeech(config, self.nranks, self.rank) + dataset = LJSpeech(config, self.nranks, self.rank) self.trainloader = dataset.trainloader self.validloader = dataset.validloader wavenet = WaveNetModule("wavenet", config, self.rank) - + # Dry run once to create and initalize all necessary parameters. audio = dg.to_variable(np.random.randn(1, 20000).astype(np.float32)) mel = dg.to_variable( @@ -45,38 +64,44 @@ class WaveNet(): if training: # Create Learning rate scheduler. 
lr_scheduler = dg.ExponentialDecay( - learning_rate = config.learning_rate, - decay_steps = config.anneal.every, - decay_rate = config.anneal.rate, + learning_rate=config.learning_rate, + decay_steps=config.anneal.every, + decay_rate=config.anneal.rate, staircase=True) - + optimizer = fluid.optimizer.AdamOptimizer( learning_rate=lr_scheduler) - + clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm( config.gradient_max_norm) # Load parameters. - utils.load_parameters(self.checkpoint_dir, self.rank, - wavenet, optimizer, - iteration=config.iteration, - file_path=config.checkpoint) + utils.load_parameters( + self.checkpoint_dir, + self.rank, + wavenet, + optimizer, + iteration=config.iteration, + file_path=config.checkpoint) print("Rank {}: checkpoint loaded.".format(self.rank)) - + # Data parallelism. if self.parallel: strategy = dg.parallel.prepare_context() wavenet = dg.parallel.DataParallel(wavenet, strategy) - + self.wavenet = wavenet self.optimizer = optimizer self.clipper = clipper else: # Load parameters. - utils.load_parameters(self.checkpoint_dir, self.rank, wavenet, - iteration=config.iteration, - file_path=config.checkpoint) + utils.load_parameters( + self.checkpoint_dir, + self.rank, + wavenet, + iteration=config.iteration, + file_path=config.checkpoint) print("Rank {}: checkpoint loaded.".format(self.rank)) self.wavenet = wavenet @@ -104,7 +129,9 @@ class WaveNet(): else: current_lr = self.optimizer._learning_rate - self.optimizer.minimize(loss, grad_clip=self.clipper, + self.optimizer.minimize( + loss, + grad_clip=self.clipper, parameter_list=self.wavenet.parameters()) self.wavenet.clear_gradients() @@ -143,10 +170,16 @@ class WaveNet(): tb = self.tb_logger tb.add_scalar("Valid-Avg-Loss", loss_val, iteration) - tb.add_audio("Teacher-Forced-Audio-0", sample_audios[0].numpy(), - iteration, sample_rate=self.config.sample_rate) - tb.add_audio("Teacher-Forced-Audio-1", sample_audios[1].numpy(), - iteration, sample_rate=self.config.sample_rate) + tb.add_audio( + "Teacher-Forced-Audio-0", + sample_audios[0].numpy(), + iteration, + sample_rate=self.config.sample_rate) + tb.add_audio( + "Teacher-Forced-Audio-1", + sample_audios[1].numpy(), + iteration, + sample_rate=self.config.sample_rate) @dg.no_grad def infer(self, iteration): @@ -165,10 +198,9 @@ class WaveNet(): start_time = time.time() syn_audio = self.wavenet.synthesize(mels_list[sample]) syn_time = time.time() - start_time - print("audio shape {}, synthesis time {}".format( - syn_audio.shape, syn_time)) - librosa.output.write_wav(filename, syn_audio, - sr=config.sample_rate) + print("audio shape {}, synthesis time {}".format(syn_audio.shape, + syn_time)) + librosa.output.write_wav(filename, syn_audio, sr=config.sample_rate) def save(self, iteration): utils.save_latest_parameters(self.checkpoint_dir, iteration, diff --git a/parakeet/models/wavenet/wavenet_modules.py b/parakeet/models/wavenet/wavenet_modules.py index fbab741..2c62643 100644 --- a/parakeet/models/wavenet/wavenet_modules.py +++ b/parakeet/models/wavenet/wavenet_modules.py @@ -1,3 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import itertools import numpy as np @@ -16,11 +30,11 @@ def get_padding(filter_size, stride, padding_type='same'): def extract_slices(x, audio_starts, audio_length, rank): slices = [] - for i in range(x.shape[0]): + for i in range(x.shape[0]): start = audio_starts.numpy()[i] end = start + audio_length slice = fluid.layers.slice( - x, axes=[0, 1], starts=[i, start], ends=[i+1, end]) + x, axes=[0, 1], starts=[i, start], ends=[i + 1, end]) slices.append(fluid.layers.squeeze(slice, [0])) x = fluid.layers.stack(slices, axis=0) @@ -50,7 +64,7 @@ class Conditioner(dg.Layer): # Register python list as parameters. for i, layer in enumerate(self.deconvs): self.add_sublayer("conv_transpose_{}".format(i), layer) - + def forward(self, x): x = fluid.layers.unsqueeze(x, 1) for layer in self.deconvs: @@ -62,7 +76,7 @@ class Conditioner(dg.Layer): class WaveNetModule(dg.Layer): def __init__(self, name_scope, config, rank): super(WaveNetModule, self).__init__(name_scope) - + self.rank = rank self.conditioner = Conditioner(self.full_name(), config) self.dilations = list( @@ -82,15 +96,13 @@ class WaveNetModule(dg.Layer): embed_dim=config.residual_channels, std=0.1) elif config.loss_type == "mix-gaussian-pdf": - self.embedding_fc = modules.FC( - self.full_name(), - in_features=1, - size=config.residual_channels, - num_flatten_dims=2, - relu=False) + self.embedding_fc = modules.FC(self.full_name(), + in_features=1, + size=config.residual_channels, + num_flatten_dims=2, + relu=False) else: - raise ValueError( - "loss_type {} is unsupported!".format(loss_type)) + raise ValueError("loss_type {} is unsupported!".format(loss_type)) self.dilated_causal_convs = [] for dilation in self.dilations: @@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer): num_filters=config.residual_channels, filter_size=config.kernel_width, dilation=dilation, - causal=True - ) - ) + causal=True)) for i, layer in enumerate(self.dilated_causal_convs): - self.add_sublayer("dilated_causal_conv_{}".format(i), layer) + self.add_sublayer("dilated_causal_conv_{}".format(i), layer) - self.fc1 = modules.FC( - self.full_name(), - in_features=config.residual_channels, - size=config.skip_channels, - num_flatten_dims=2, - relu=True, - act="relu") + self.fc1 = modules.FC(self.full_name(), + in_features=config.residual_channels, + size=config.skip_channels, + num_flatten_dims=2, + relu=True, + act="relu") - self.fc2 = modules.FC( - self.full_name(), - in_features=config.skip_channels, - size=config.skip_channels, - num_flatten_dims=2, - relu=True, - act="relu") + self.fc2 = modules.FC(self.full_name(), + in_features=config.skip_channels, + size=config.skip_channels, + num_flatten_dims=2, + relu=True, + act="relu") if config.loss_type == "softmax": - self.fc3 = modules.FC( - self.full_name(), - in_features=config.skip_channels, - size=config.num_channels, - num_flatten_dims=2, - relu=False) + self.fc3 = modules.FC(self.full_name(), + in_features=config.skip_channels, + size=config.num_channels, + num_flatten_dims=2, + relu=False) elif config.loss_type == "mix-gaussian-pdf": - self.fc3 = modules.FC( - self.full_name(), - 
@@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer):
                     num_filters=config.residual_channels,
                     filter_size=config.kernel_width,
                     dilation=dilation,
-                    causal=True
-                )
-            )
+                    causal=True))
 
         for i, layer in enumerate(self.dilated_causal_convs):
-            self.add_sublayer("dilated_causal_conv_{}".format(i), layer) 
+            self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
 
-        self.fc1 = modules.FC(
-            self.full_name(),
-            in_features=config.residual_channels,
-            size=config.skip_channels,
-            num_flatten_dims=2,
-            relu=True,
-            act="relu")
+        self.fc1 = modules.FC(self.full_name(),
+                              in_features=config.residual_channels,
+                              size=config.skip_channels,
+                              num_flatten_dims=2,
+                              relu=True,
+                              act="relu")
 
-        self.fc2 = modules.FC(
-            self.full_name(),
-            in_features=config.skip_channels,
-            size=config.skip_channels,
-            num_flatten_dims=2,
-            relu=True,
-            act="relu")
+        self.fc2 = modules.FC(self.full_name(),
+                              in_features=config.skip_channels,
+                              size=config.skip_channels,
+                              num_flatten_dims=2,
+                              relu=True,
+                              act="relu")
 
         if config.loss_type == "softmax":
-            self.fc3 = modules.FC(
-                self.full_name(),
-                in_features=config.skip_channels,
-                size=config.num_channels,
-                num_flatten_dims=2,
-                relu=False)
+            self.fc3 = modules.FC(self.full_name(),
+                                  in_features=config.skip_channels,
+                                  size=config.num_channels,
+                                  num_flatten_dims=2,
+                                  relu=False)
         elif config.loss_type == "mix-gaussian-pdf":
-            self.fc3 = modules.FC(
-                self.full_name(),
-                in_features=config.skip_channels,
-                size=3 * config.num_mixtures,
-                num_flatten_dims=2,
-                relu=False)
+            self.fc3 = modules.FC(self.full_name(),
+                                  in_features=config.skip_channels,
+                                  size=3 * config.num_mixtures,
+                                  num_flatten_dims=2,
+                                  relu=False)
         else:
-            raise ValueError(
-                "loss_type {} is unsupported!".format(loss_type))
+            raise ValueError(
+                "loss_type {} is unsupported!".format(config.loss_type))
 
     def sample_softmax(self, mix_parameters):
         batch, length, hidden = mix_parameters.shape
         mix_param_2d = fluid.layers.reshape(mix_parameters,
-                                           [batch * length, hidden])
+                                            [batch * length, hidden])
         mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)
 
         # quantized: [batch * length]
-        quantized = fluid.layers.cast(fluid.layers.sampling_id(mix_param_2d),
-                                      dtype="float32")
+        quantized = fluid.layers.cast(
+            fluid.layers.sampling_id(mix_param_2d), dtype="float32")
         samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0
 
         # samples: [batch * length]
@@ -162,23 +167,23 @@ class WaveNetModule(dg.Layer):
         # to [bs * len, 3 * num_mixtures].
         batch, length, hidden = mix_parameters.shape
         mix_param_2d = fluid.layers.reshape(mix_parameters,
-                                           [batch * length, hidden])
+                                            [batch * length, hidden])
         K = hidden // 3
 
         # Unpack the parameters of the mixture of gaussian.
-        logits_pi = mix_param_2d[:, 0 : K]
-        mu = mix_param_2d[:, K : 2*K]
-        log_s = mix_param_2d[:, 2*K : 3*K]
+        logits_pi = mix_param_2d[:, 0:K]
+        mu = mix_param_2d[:, K:2 * K]
+        log_s = mix_param_2d[:, 2 * K:3 * K]
 
         s = fluid.layers.exp(log_s)
         pi = fluid.layers.softmax(logits_pi, axis=-1)
         comp_samples = fluid.layers.sampling_id(pi)
-
+
         row_idx = dg.to_variable(np.arange(batch * length))
         comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)
 
         mu_comp = fluid.layers.gather_nd(mu, comp_samples)
-        s_comp = fluid.layers.gather_nd(s, comp_samples) 
+        s_comp = fluid.layers.gather_nd(s, comp_samples)
 
         # N(0, 1) normal sample.
         u = fluid.layers.gaussian_random(shape=[batch * length])
@@ -220,8 +225,9 @@ class WaveNetModule(dg.Layer):
 
         # Calculate gaussian loss.
         targets = fluid.layers.unsqueeze(targets, -1)
-        targets = fluid.layers.expand(targets, [1, 1, self.config.num_mixtures])
-        x_std = inv_s * (targets - mu)
+        targets = fluid.layers.expand(targets,
+                                      [1, 1, self.config.num_mixtures])
+        x_std = inv_s * (targets - mu)
         exponent = fluid.layers.exp(-0.5 * x_std * x_std)
         pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
         pdf_x = pi * pdf_x
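
Note: `sample_mix_gaussian` above draws one Gaussian component per time step from softmaxed logits, then samples `mu + s * u` with `u ~ N(0, 1)`. A numpy sketch of the same math, independent of the fluid ops (the function name is hypothetical):

```python
import numpy as np

def sample_mix_gaussian_np(mix_params, rng=np.random.default_rng(0)):
    # mix_params: [n, 3K] rows of (logits_pi | mu | log_s), as in the module.
    n, three_k = mix_params.shape
    K = three_k // 3
    logits_pi, mu, log_s = np.split(mix_params, 3, axis=1)
    pi = np.exp(logits_pi - logits_pi.max(axis=1, keepdims=True))
    pi /= pi.sum(axis=1, keepdims=True)                # softmax over components
    comp = np.array([rng.choice(K, p=p) for p in pi])  # one component per row
    u = rng.standard_normal(n)                         # N(0, 1) noise
    rows = np.arange(n)
    return mu[rows, comp] + np.exp(log_s[rows, comp]) * u

# With zero parameters, pi is uniform, mu = 0, s = 1: plain normal draws.
print(sample_mix_gaussian_np(np.zeros((4, 6))))  # K = 2
```
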
@@ -239,9 +245,9 @@ class WaveNetModule(dg.Layer):
 
         # Slice conditioners.
         audio_length = audios.shape[1]
-        conditioner = extract_slices(full_conditioner,
-                                     audio_starts, audio_length, self.rank)
-
+        conditioner = extract_slices(full_conditioner, audio_starts,
+                                     audio_length, self.rank)
+
         # input_audio, target_audio: [bs, len]
         input_audios = audios[:, :-1]
         target_audios = audios[:, 1:]
@@ -263,15 +269,16 @@ class WaveNetModule(dg.Layer):
             layer_input = self.embedding_fc(
                 fluid.layers.unsqueeze(input_audios, 2))
         else:
-            raise ValueError(
-                "loss_type {} is unsupported!".format(loss_type))
+            raise ValueError("loss_type {} is unsupported!".format(loss_type))
 
         # layer_input: [bs, res_channel, 1, len]
         layer_input = fluid.layers.unsqueeze(
-            fluid.layers.transpose(layer_input, perm=[0, 2, 1]), 2)
+            fluid.layers.transpose(
+                layer_input, perm=[0, 2, 1]), 2)
 
         # conditioner: [bs, mel_bands, 1, len]
         conditioner = fluid.layers.unsqueeze(
-            fluid.layers.transpose(conditioner, perm=[0, 2, 1]), 2)
+            fluid.layers.transpose(
+                conditioner, perm=[0, 2, 1]), 2)
 
         skip = None
         for i, layer in enumerate(self.dilated_causal_convs):
@@ -292,23 +299,22 @@ class WaveNetModule(dg.Layer):
             elif loss_type == "mix-gaussian-pdf":
                 sample_audios = self.sample_mix_gaussian(mix_parameters)
             else:
-                raise ValueError(
-                    "loss_type {} is unsupported!".format(loss_type))
+                raise ValueError("loss_type {} is unsupported!".format(
+                    loss_type))
 
         if loss_type == "softmax":
             loss = self.softmax_loss(target_audios, mix_parameters)
         elif loss_type == "mix-gaussian-pdf":
-            loss = self.mixture_density_loss(target_audios,
-                                             mix_parameters, self.log_scale_min)
+            loss = self.mixture_density_loss(target_audios, mix_parameters,
+                                             self.log_scale_min)
         else:
-            raise ValueError(
-                "loss_type {} is unsupported!".format(loss_type))
+            raise ValueError("loss_type {} is unsupported!".format(loss_type))
 
         return loss, sample_audios
 
     def synthesize(self, mels):
         self.start_new_sequence()
-        bs, n_frames, mel_bands = mels.shape 
+        bs, n_frames, mel_bands = mels.shape
         conditioner = self.conditioner(mels)
         time_steps = conditioner.shape[1]
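
Note: the forward pass above conditions on `audios[:, :-1]` to predict `audios[:, 1:]`, standard teacher forcing for an autoregressive model. In numpy terms:

```python
import numpy as np

audios = np.arange(8, dtype=np.float32).reshape(1, 8)  # [batch, time]
input_audios = audios[:, :-1]   # the model sees samples 0 .. T-2
target_audios = audios[:, 1:]   # and is trained to predict samples 1 .. T-1
assert input_audios.shape == target_audios.shape
print(input_audios[0], target_audios[0])
```
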
@@ -335,23 +341,24 @@ class WaveNetModule(dg.Layer):
             elif loss_type == "mix-gaussian-pdf":
                 audio_input = self.embedding_fc(current_sample)
             else:
-                raise ValueError(
-                    "loss_type {} is unsupported!".format(loss_type))
+                raise ValueError("loss_type {} is unsupported!".format(
+                    loss_type))
 
             # [bs, channel, 1, 1]
             audio_input = fluid.layers.unsqueeze(
-                fluid.layers.transpose(audio_input, perm=[0, 2, 1]), 2)
+                fluid.layers.transpose(
+                    audio_input, perm=[0, 2, 1]), 2)
 
             # [bs, mel_bands]
             cond_input = conditioner[:, i, :]
             # [bs, mel_bands, 1, 1]
-            cond_input = fluid.layers.reshape(
-                cond_input, cond_input.shape + [1, 1])
+            cond_input = fluid.layers.reshape(cond_input,
+                                              cond_input.shape + [1, 1])
 
             skip = None
             for layer in self.dilated_causal_convs:
-                audio_input, skip = layer.add_input(
-                    audio_input, skip, cond_input)
-
+                audio_input, skip = layer.add_input(audio_input, skip,
+                                                    cond_input)
+
             # [bs, 1, channel]
             skip = fluid.layers.transpose(
                 fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
@@ -361,19 +368,19 @@ class WaveNetModule(dg.Layer):
             elif loss_type == "mix-gaussian-pdf":
                 sample = self.sample_mix_gaussian(mix_parameters)
             else:
-                raise ValueError(
-                    "loss_type {} is unsupported!".format(loss_type))
+                raise ValueError("loss_type {} is unsupported!".format(
+                    loss_type))
             audio_samples.append(sample)
 
             # [bs]
             current_sample = audio_samples[-1]
             # [bs, 1, 1]
-            current_sample = fluid.layers.reshape(current_sample,
-                                                  current_sample.shape + [1, 1])
+            current_sample = fluid.layers.reshape(
+                current_sample, current_sample.shape + [1, 1])
 
         # syn_audio: [num_samples]
         syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
-        return syn_audio 
+        return syn_audio
 
     def start_new_sequence(self):
         for layer in self.sublayers():
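
Note: `synthesize` generates one sample at a time: each step embeds the previous sample, runs the causal stack incrementally via `add_input`, samples the next value from the predicted distribution, and feeds it back. A schematic numpy loop, where `predict_fn` is a stand-in for the network step (not a Parakeet API):

```python
import numpy as np

def synthesize_np(predict_fn, conditioner, rng=np.random.default_rng(0)):
    # conditioner: [time_steps, cond_dim]; one output sample per step.
    current = 0.0
    samples = []
    for cond in conditioner:                # step through upsampled frames
        params = predict_fn(current, cond)  # (prev sample, cond) -> dist params
        current = rng.normal(params["mu"], params["s"])  # draw next sample
        samples.append(current)
    return np.array(samples)

toy = synthesize_np(lambda prev, cond: {"mu": 0.5 * prev + cond[0], "s": 0.01},
                    np.zeros((5, 1)))
print(toy.shape)  # (5,)
```
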
""" + def __init__(self, num_channels, num_filters, @@ -72,17 +92,18 @@ class Conv1D(dg.Conv2D): use_cudnn=True, act=None, dtype='float32'): - super(Conv1D, self).__init__(num_channels, - num_filters, (1, filter_size), - stride=(1, stride), - padding=(0, padding), - dilation=(1, dilation), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) + super(Conv1D, self).__init__( + num_channels, + num_filters, (1, filter_size), + stride=(1, stride), + padding=(0, padding), + dilation=(1, dilation), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) def forward(self, x): x = F.unsqueeze(x, [2]) @@ -105,18 +126,19 @@ class Conv1DTranspose(dg.Conv2DTranspose): use_cudnn=True, act=None, dtype='float32'): - super(Conv1DTranspose, self).__init__(num_channels, - num_filters, (1, filter_size), - output_size=None, - padding=(0, padding), - stride=(1, stride), - dilation=(1, dilation), - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) + super(Conv1DTranspose, self).__init__( + num_channels, + num_filters, (1, filter_size), + output_size=None, + padding=(0, padding), + stride=(1, stride), + dilation=(1, dilation), + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) def forward(self, x): x = F.unsqueeze(x, [2]) @@ -134,6 +156,7 @@ class Conv1DCell(Conv1D): It is a cell that it acts like an RNN cell. It does not support stride > 1, and it ensures 1-to-1 mapping from input time steps to output timesteps. """ + def __init__(self, num_channels, num_filters, @@ -150,18 +173,19 @@ class Conv1DCell(Conv1D): padding = receptive_field - 1 if causal else receptive_field // 2 self._receptive_field = receptive_field self.causal = causal - super(Conv1DCell, self).__init__(num_channels, - num_filters, - filter_size, - stride=1, - padding=padding, - dilation=dilation, - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act, - dtype=dtype) + super(Conv1DCell, self).__init__( + num_channels, + num_filters, + filter_size, + stride=1, + padding=padding, + dilation=dilation, + groups=groups, + param_attr=param_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) def forward(self, x): # it ensures that ouput time steps == input time steps @@ -189,15 +213,16 @@ class Conv1DCell(Conv1D): def add_input(self, x_t): batch_size, c_in, _ = x_t.shape if self._buffer is None: - self._buffer = F.zeros((batch_size, c_in, self.receptive_field), - dtype=x_t.dtype) + self._buffer = F.zeros( + (batch_size, c_in, self.receptive_field), dtype=x_t.dtype) self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1) if self._dilation[1] > 1: - input = F.strided_slice(self._buffer, - axes=[2], - starts=[0], - ends=[self.receptive_field], - strides=[self._dilation[1]]) + input = F.strided_slice( + self._buffer, + axes=[2], + starts=[0], + ends=[self.receptive_field], + strides=[self._dilation[1]]) else: input = self._buffer input = F.reshape(input, (batch_size, -1)) diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py index e84c598..3a6602e 100644 --- a/parakeet/modules/dynamic_gru.py +++ b/parakeet/modules/dynamic_gru.py @@ -1,6 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py
index e84c598..3a6602e 100644
--- a/parakeet/modules/dynamic_gru.py
+++ b/parakeet/modules/dynamic_gru.py
@@ -1,6 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 
+
 class DynamicGRU(dg.Layer):
     def __init__(self,
                  size,
@@ -49,4 +63,3 @@ class DynamicGRU(dg.Layer):
             res = res[::-1]
         res = layers.concat(res, axis=1)
         return res
-
diff --git a/parakeet/modules/ffn.py b/parakeet/modules/ffn.py
index dc413bf..3fa8c16 100644
--- a/parakeet/modules/ffn.py
+++ b/parakeet/modules/ffn.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
@@ -7,28 +20,41 @@ from parakeet.modules.customized import Conv1D
 
 class PositionwiseFeedForward(dg.Layer):
     ''' A two-feed-forward-layer module '''
-    def __init__(self, d_in, num_hidden, filter_size, padding=0, use_cudnn=True, dropout=0.1):
+
+    def __init__(self,
+                 d_in,
+                 num_hidden,
+                 filter_size,
+                 padding=0,
+                 use_cudnn=True,
+                 dropout=0.1):
         super(PositionwiseFeedForward, self).__init__()
         self.num_hidden = num_hidden
         self.use_cudnn = use_cudnn
         self.dropout = dropout
 
         k = math.sqrt(1 / d_in)
-        self.w_1 = Conv1D(num_channels = d_in,
-                          num_filters = num_hidden,
-                          filter_size = filter_size,
-                          padding=padding,
-                          param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
-                          bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
-                          use_cudnn = use_cudnn)
+        self.w_1 = Conv1D(
+            num_channels=d_in,
+            num_filters=num_hidden,
+            filter_size=filter_size,
+            padding=padding,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)),
+            use_cudnn=use_cudnn)
         k = math.sqrt(1 / num_hidden)
-        self.w_2 = Conv1D(num_channels = num_hidden,
-                          num_filters = d_in,
-                          filter_size = filter_size,
-                          padding=padding,
-                          param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
-                          bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
-                          use_cudnn = use_cudnn)
+        self.w_2 = Conv1D(
+            num_channels=num_hidden,
+            num_filters=d_in,
+            filter_size=filter_size,
+            padding=padding,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)),
+            use_cudnn=use_cudnn)
         self.layer_norm = dg.LayerNorm(d_in)
 
     def forward(self, input):
@@ -40,18 +66,18 @@ class PositionwiseFeedForward(dg.Layer):
         Returns:
             output (Variable), Shape(B, T, C), the result after FFN.
         """
-        x = layers.transpose(input, [0,2,1])
+        x = layers.transpose(input, [0, 2, 1])
 
         # FFN network
         x = self.w_2(layers.relu(self.w_1(x)))
-
+
         # dropout
         x = layers.dropout(x, self.dropout)
 
-        x = layers.transpose(x, [0,2,1])
+        x = layers.transpose(x, [0, 2, 1])
         # residual connection
         x = x + input
-
+
         # layer normalization
         output = self.layer_norm(x)
-        return output
\ No newline at end of file
+        return output
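
Note: `PositionwiseFeedForward` is the transformer FFN with 1-D convolutions standing in for the two linear maps: `w_2(relu(w_1(x)))` plus dropout, a residual connection, and layer norm. A numpy sketch of the data flow under the assumption `filter_size=1`, where the convs reduce to per-position matmuls (function name hypothetical):

```python
import numpy as np

def positionwise_ffn_np(x, W1, W2, eps=1e-5):
    # x: [T, d_in]; W1: [d_in, d_hidden]; W2: [d_hidden, d_in].
    h = np.maximum(x @ W1, 0.0)              # first conv1x1 + relu
    y = h @ W2 + x                           # second conv1x1 + residual
    mean = y.mean(axis=-1, keepdims=True)
    var = y.var(axis=-1, keepdims=True)
    return (y - mean) / np.sqrt(var + eps)   # layer norm (affine omitted)

rng = np.random.default_rng(0)
out = positionwise_ffn_np(rng.normal(size=(5, 8)),
                          rng.normal(size=(8, 32)) * 0.1,
                          rng.normal(size=(32, 8)) * 0.1)
print(out.shape)  # (5, 8)
```
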
""" # Compute attention score - attention = layers.matmul(query, key, transpose_y=True) #transpose the last dim in y + attention = layers.matmul( + query, key, transpose_y=True) #transpose the last dim in y attention = attention / math.sqrt(self.d_key) # Mask key to ignore padding if mask is not None: attention = attention * mask - mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) + mask = (mask == 0).astype(np.float32) * (-2**32 + 1) attention = attention + mask - + attention = layers.softmax(attention) attention = layers.dropout(attention, dropout) - + # Mask query to ignore padding if query_mask is not None: attention = attention * query_mask - + result = layers.matmul(attention, value) return result, attention + class MultiheadAttention(dg.Layer): - def __init__(self, num_hidden, d_k, d_q, num_head=4, is_bias=False, dropout=0.1, is_concat=True): + def __init__(self, + num_hidden, + d_k, + d_q, + num_head=4, + is_bias=False, + dropout=0.1, + is_concat=True): super(MultiheadAttention, self).__init__() self.num_hidden = num_hidden self.num_head = num_head @@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer): # repeat masks h times if query_mask is not None: - query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key]) + query_mask = layers.expand(query_mask, + [self.num_head, 1, seq_len_key]) if mask is not None: mask = layers.expand(mask, (self.num_head, 1, 1)) - - + # Make multihead attention # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) - key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) - value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k]) - query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q]) + key = layers.reshape( + self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k]) + value = layers.reshape( + self.value(value), + [batch_size, seq_len_key, self.num_head, self.d_k]) + query = layers.reshape( + self.query(query_input), + [batch_size, seq_len_query, self.num_head, self.d_q]) + + key = layers.reshape( + layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + value = layers.reshape( + layers.transpose(value, [2, 0, 1, 3]), + [-1, seq_len_key, self.d_k]) + query = layers.reshape( + layers.transpose(query, [2, 0, 1, 3]), + [-1, seq_len_query, self.d_q]) + + result, attention = self.scal_attn( + key, value, query, mask=mask, query_mask=query_mask) - key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) - value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) - query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q]) - - result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask) - # concat all multihead result - result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) - result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) + result = layers.reshape( + result, [self.num_head, batch_size, seq_len_query, self.d_q]) + result = layers.reshape( + layers.transpose(result, [1, 2, 0, 3]), + [batch_size, seq_len_query, -1]) if self.is_concat: - result = layers.concat([query_input,result], axis=-1) + result = layers.concat([query_input, result], axis=-1) result = layers.dropout(self.fc(result), self.dropout) result = result + query_input - + result = self.layer_norm(result) - 
 
+
 class MultiheadAttention(dg.Layer):
-    def __init__(self, num_hidden, d_k, d_q, num_head=4, is_bias=False, dropout=0.1, is_concat=True):
+    def __init__(self,
+                 num_hidden,
+                 d_k,
+                 d_q,
+                 num_head=4,
+                 is_bias=False,
+                 dropout=0.1,
+                 is_concat=True):
         super(MultiheadAttention, self).__init__()
         self.num_hidden = num_hidden
         self.num_head = num_head
@@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer):
 
         # repeat masks h times
         if query_mask is not None:
-            query_mask = layers.expand(query_mask, [self.num_head, 1, seq_len_key])
+            query_mask = layers.expand(query_mask,
+                                       [self.num_head, 1, seq_len_key])
 
         if mask is not None:
             mask = layers.expand(mask, (self.num_head, 1, 1))
-
-
+
         # Make multihead attention
         # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
-        key = layers.reshape(self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
-        value = layers.reshape(self.value(value), [batch_size, seq_len_key, self.num_head, self.d_k])
-        query = layers.reshape(self.query(query_input), [batch_size, seq_len_query, self.num_head, self.d_q])
+        key = layers.reshape(
+            self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
+        value = layers.reshape(
+            self.value(value),
+            [batch_size, seq_len_key, self.num_head, self.d_k])
+        query = layers.reshape(
+            self.query(query_input),
+            [batch_size, seq_len_query, self.num_head, self.d_q])
+
+        key = layers.reshape(
+            layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
+        value = layers.reshape(
+            layers.transpose(value, [2, 0, 1, 3]),
+            [-1, seq_len_key, self.d_k])
+        query = layers.reshape(
+            layers.transpose(query, [2, 0, 1, 3]),
+            [-1, seq_len_query, self.d_q])
+
+        result, attention = self.scal_attn(
+            key, value, query, mask=mask, query_mask=query_mask)
 
-        key = layers.reshape(layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
-        value = layers.reshape(layers.transpose(value, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
-        query = layers.reshape(layers.transpose(query, [2, 0, 1, 3]), [-1, seq_len_query, self.d_q])
-
-        result, attention = self.scal_attn(key, value, query, mask=mask, query_mask=query_mask)
-
         # concat all multihead result
-        result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
-        result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
+        result = layers.reshape(
+            result, [self.num_head, batch_size, seq_len_query, self.d_q])
+        result = layers.reshape(
+            layers.transpose(result, [1, 2, 0, 3]),
+            [batch_size, seq_len_query, -1])
         if self.is_concat:
-            result = layers.concat([query_input,result], axis=-1)
+            result = layers.concat([query_input, result], axis=-1)
         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input
-
+
         result = self.layer_norm(result)
-        return result, attention
\ No newline at end of file
+        return result, attention
diff --git a/parakeet/modules/weight_norm.py b/parakeet/modules/weight_norm.py
index 9e28792..92f1085 100644
--- a/parakeet/modules/weight_norm.py
+++ b/parakeet/modules/weight_norm.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from paddle import fluid
 import paddle.fluid.dygraph as dg
diff --git a/parakeet/utils/layer_tools.py b/parakeet/utils/layer_tools.py
index eaa9c9e..a045c78 100644
--- a/parakeet/utils/layer_tools.py
+++ b/parakeet/utils/layer_tools.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from torch import nn
 import paddle.fluid.dygraph as dg
@@ -10,8 +24,8 @@ def summary(layer):
         print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
         num_elements += np.prod(param.shape)
         num_params += 1
-    print("layer has {} parameters, {} elements.".format(
-        num_params, num_elements))
+    print("layer has {} parameters, {} elements.".format(num_params,
+                                                         num_elements))
 
 
 def freeze(layer):
@@ -31,5 +45,5 @@ def torch_summary(layer):
         print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
         num_elements += np.prod(param.shape)
         num_params += 1
-    print("layer has {} parameters, {} elements.".format(
-        num_params, num_elements))
+    print("layer has {} parameters, {} elements.".format(num_params,
+                                                         num_elements))
diff --git a/setup.py b/setup.py
index 1cd6e8a..2384837 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,27 @@
-import os
-import io
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import io
 import re
 
 from setuptools import setup, find_packages
 
+
 def read(*names, **kwargs):
     with io.open(
-        os.path.join(os.path.dirname(__file__), *names),
-        encoding=kwargs.get("encoding", "utf8")
-    ) as fp:
+            os.path.join(os.path.dirname(__file__), *names),
+            encoding=kwargs.get("encoding", "utf8")) as fp:
         return fp.read()
 
@@ -19,6 +33,7 @@ def find_version(*file_paths):
         return version_match.group(1)
     raise RuntimeError("Unable to find version string.")
 
+
 VERSION = find_version('parakeet', '__init__.py')
 long_description = read('README.md')
 
@@ -32,17 +47,26 @@ setup_info = dict(
     description='Speech synthesis tools and models based on Paddlepaddle',
     long_description=long_description,
     license='Apache 2',
     install_requires=[
-        'numpy', 'nltk', 'inflect', 'librosa', 'unidecode', 'numba',
-        'tqdm', 'matplotlib', 'tensorboardX', 'tensorboard', 'scipy',
-        'ruamel.yaml', 'pandas', 'sox', 'soundfile',
+        'numpy',
+        'nltk',
+        'inflect',
+        'librosa',
+        'unidecode',
+        'numba',
+        'tqdm',
+        'matplotlib',
+        'tensorboardX',
+        'tensorboard',
+        'scipy',
+        'ruamel.yaml',
+        'pandas',
+        'sox',
+        'soundfile',
     ],
 
     # Package info
     packages=find_packages(exclude=('tests', 'tests.*')),
+    zip_safe=True, )
 
-    zip_safe=True,
-)
-
-setup(**setup_info)
\ No newline at end of file
+setup(**setup_info)
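
Note: `find_version` in setup.py scans a source file for a `__version__` assignment with a regex and raises if it is missing. The hunk does not show the pattern itself; the sketch below uses the conventional one this helper is normally written with, so treat the regex as an assumption:

```python
import re

def find_version_in(text):
    # Same idea as setup.py's find_version: match __version__ = "x.y.z".
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", text, re.M)
    if match:
        return match.group(1)
    raise RuntimeError("Unable to find version string.")

print(find_version_in('__version__ = "0.1.0"\n'))  # 0.1.0
```
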
diff --git a/tests/test_ljspeech.py b/tests/test_ljspeech.py
index 34f5011..d6187e8 100644
--- a/tests/test_ljspeech.py
+++ b/tests/test_ljspeech.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.datasets.ljspeech import LJSpeech
 from parakeet.data.datacargo import DataCargo
diff --git a/tests/test_vctk.py b/tests/test_vctk.py
index 3f7d61e..58ef0ca 100644
--- a/tests/test_vctk.py
+++ b/tests/test_vctk.py
@@ -1,11 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from parakeet.datasets import vctk
 from pathlib import Path
 from parakeet.data.datacargo import DataCargo
 
 root = Path("/workspace/datasets/VCTK-Corpus")
 vctk_dataset = vctk.VCTK(root)
-vctk_cargo = DataCargo(vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
+vctk_cargo = DataCargo(
+    vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
 
 for i, batch in enumerate(vctk_cargo):
     print(i)
-
diff --git a/tools/copyright.hook b/tools/copyright.hook
new file mode 100644
index 0000000..23aaf38
--- /dev/null
+++ b/tools/copyright.hook
@@ -0,0 +1,121 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+LANG_COMMENT_MARK = None
+
+NEW_LINE_MARK = None
+
+COPYRIGHT_HEADER = None
+
+if platform.system() == "Windows":
+    NEW_LINE_MARK = "\r\n"
+else:
+    NEW_LINE_MARK = '\n'
+    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
+    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
+    date, err = process.communicate()
+    date = date.decode("utf-8").rstrip("\n")
+    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    if lang == 'Python':
+        LANG_COMMENT_MARK = '#'
+    else:
+        LANG_COMMENT_MARK = "//"
+
+    lines = template.split(NEW_LINE_MARK)
+    BLANK = " "
+    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
+    for lino, line in enumerate(lines):
+        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
+        if len(line) == 0:
+            BLANK = ""
+        else:
+            BLANK = " "
+        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
+
+    return ans + "\n"
+
+
+def lang_type(filename):
+    if filename.endswith(".py"):
+        return "Python"
+    elif filename.endswith(".h"):
+        return "C"
+    elif filename.endswith(".c"):
+        return "C"
+    elif filename.endswith(".hpp"):
+        return "C"
+    elif filename.endswith(".cc"):
+        return "C"
+    elif filename.endswith(".cpp"):
+        return "C"
+    elif filename.endswith(".cu"):
+        return "C"
+    elif filename.endswith(".cuh"):
+        return "C"
+    elif filename.endswith(".go"):
+        return "C"
+    elif filename.endswith(".proto"):
+        return "C"
+    else:
+        print("Unsupported filetype %s" % filename)
+        exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        fd = io.open(filename, encoding="utf-8")
+        first_line = fd.readline()
+        second_line = fd.readline()
+        if "COPYRIGHT (C)" in first_line.upper(): continue
+        if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
+            continue
+        original_contents = io.open(filename, encoding="utf-8").read()
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        with io.open(filename, 'w') as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+if __name__ == '__main__':
+    exit(main())
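
Note: for a Python file, `generate_copyright` prepends each license line with "# " (a bare "#" for blank lines) and skips the template's leading and trailing blanks. A small self-contained restatement of that prefixing rule (the real hook also rewrites the copyright year via `date +%Y`):

```python
# Hypothetical standalone illustration of the hook's comment-prefixing rule.
license_lines = [
    'Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.',
    '',
    'Licensed under the Apache License, Version 2.0 (the "License");',
]
header = "\n".join(
    "#" + (" " + line if line else "") for line in license_lines)
print(header)
# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
```
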