From 9b8fd9f93d094ef16230b4049de3305a1a2b2604 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 22 May 2020 07:16:45 +0000 Subject: [PATCH 01/10] Upgrade waveflow to 1.8.0 --- parakeet/models/waveflow/waveflow_modules.py | 2 +- parakeet/modules/weight_norm.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index 51f9108..03f873b 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -348,7 +348,7 @@ class WaveFlowModule(dg.Layer): mel = self.conditioner(mel) assert mel.shape[2] >= audio.shape[1] # Prune out the tail of audio/mel so that time/n_group == 0. - pruned_len = audio.shape[1] // self.n_group * self.n_group + pruned_len = int(audio.shape[1] // self.n_group * self.n_group) if audio.shape[1] > pruned_len: audio = audio[:, :pruned_len] diff --git a/parakeet/modules/weight_norm.py b/parakeet/modules/weight_norm.py index 7f68cd9..82203d6 100644 --- a/parakeet/modules/weight_norm.py +++ b/parakeet/modules/weight_norm.py @@ -87,7 +87,14 @@ def compute_l2_normalized_weight(v, g, dim): def compute_weight(v, g, dim, power): assert len(g.shape) == 1, "magnitude should be a vector" if power == 2: - return compute_l2_normalized_weight(v, g, dim) + in_dtype = v.dtype + if in_dtype == fluid.core.VarDesc.VarType.FP16: + v = F.cast(v, "float32") + g = F.cast(g, "float32") + weight = compute_l2_normalized_weight(v, g, dim) + if in_dtype == fluid.core.VarDesc.VarType.FP16: + weight = F.cast(weight, "float16") + return weight else: v_normalized = F.elementwise_div( v, (norm_except(v, dim, power) + 1e-12), axis=dim) From 74266afc2b5b081c129a5a1390a5dfcfe137cf8c Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 25 May 2020 17:08:52 +0800 Subject: [PATCH 02/10] dv3: set p_replace_pronunciation to 0 at evaluation --- examples/deepvoice3/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/examples/deepvoice3/utils.py b/examples/deepvoice3/utils.py index a0e8c7d..c600f5b 100644 --- a/examples/deepvoice3/utils.py +++ b/examples/deepvoice3/utils.py @@ -60,7 +60,7 @@ def add_options(parser): def make_evaluator(config, text_sequences, output_dir, writer=None): c = config["transform"] - p_replace = c["replace_pronunciation_prob"] + p_replace = 0.0 sample_rate = c["sample_rate"] preemphasis = c["preemphasis"] win_length = c["win_length"] From aa8e4ea0a8b741b757a523b8aeae6e0ed1ee17a1 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 27 May 2020 05:24:39 +0000 Subject: [PATCH 03/10] fix README for clarinet, pin numba and tqdm verison --- examples/clarinet/README.md | 4 ++-- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/clarinet/README.md b/examples/clarinet/README.md index ca74b2d..cb02475 100644 --- a/examples/clarinet/README.md +++ b/examples/clarinet/README.md @@ -129,7 +129,7 @@ Example script: ```bash python synthesis.py \ - --config=./configs/wavenet_single_gaussian.yaml \ + --config=./configs/clarinet_ljspeech.yaml \ --data=./LJSpeech-1.1/ \ --device=0 \ --iteration=500000 \ @@ -140,7 +140,7 @@ or ```bash python synthesis.py \ - --config=./configs/wavenet_single_gaussian.yaml \ + --config=./configs/clarinet_ljspeech.yaml \ --data=./LJSpeech-1.1/ \ --device=0 \ --checkpoint="experiment/checkpoints/step-500000" \ diff --git a/setup.py b/setup.py index 062a02d..244eef8 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,8 @@ setup_info = dict( 'inflect', 'librosa', 'unidecode', - 'numba', - 'tqdm', + 'numba==0.48.0', + 'tqdm==4.19.8', 'matplotlib', 'tensorboardX', 'tensorboard', From 9ba26facf0d51e4c58265e9dddf2de31507753e5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 2 Jun 2020 07:07:10 +0000 Subject: [PATCH 04/10] deep coice 3: use np.int64 explicitly --- examples/deepvoice3/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deepvoice3/utils.py 
b/examples/deepvoice3/utils.py index c600f5b..20e3219 100644 --- a/examples/deepvoice3/utils.py +++ b/examples/deepvoice3/utils.py @@ -121,7 +121,7 @@ class Evaluator(object): en.text_to_sequence( text, p=self.p_replace), dtype=np.int64) length = len(text) - text_positions = np.arange(1, 1 + length) + text_positions = np.arange(1, 1 + length, dtype=np.int64) text = np.expand_dims(text, 0) text_positions = np.expand_dims(text_positions, 0) From a49e0c6883f14ece835ad6903d57e3f35b07087a Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 3 Jun 2020 15:14:49 +0800 Subject: [PATCH 05/10] Release tts model ckpts with griffin-lim --- README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7bb380e..8058c0e 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ Entries to the introduction, and the launch of training and synthsis for differe ## Pre-trained models and audio samples -Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the url link to the pre-trained model, the dataset that the model is trained on, and synthesized audio samples based on the pre-trained model. +Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the url link to the pre-trained model, the dataset that the model is trained on, and synthesized audio samples based on the pre-trained model. Click each model name to download, then you can get the compressed package which contains the pre-trained model and the `yaml` config describing how the model is trained. 
#### Vocoders @@ -174,29 +174,77 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels, #### TTS models +We also provide checkpoints for the different end-to-end TTS models, and present the synthesized audio examples for some randomly chosen famous quotes. The corresponding texts are displayed as follows. + +||Text | From | +|:-:|:-- | :--: | +0|*Life was like a box of chocolates, you never know what you're gonna get.* | *Forrest Gump* | +1|*With great power there must come great responsibility.* | *Spider-Man*| +2|*To be or not to be, that’s a question.*|*Hamlet*| +3|*Death is just a part of life, something we're all destined to do.*| *Forrest Gump*| +4|*Don’t argue with the people of strong determination, because they may change the fact!*| *William Shakespeare* | + +Users have the option to use different vocoders to convert mel spectrograms to raw audio in TTS models. Taking this into account, we are going to release the checkpoints for TTS models adapted to different vocoders, including the [Griffin-Lim](https://ieeexplore.ieee.org/document/1164317) algorithm and some neural vocoders. + +##### 1) Griffin-Lim +
- - + + + + @@ -204,8 +252,9 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels,
- Deep Voice 3 + + Deep Voice 3 - Transformer TTS + Transformer TTS
+ FastSpeech +
LJSpeech LJSpeech LJSpeech
- To be added soon + +
+ +
+ +
+ +
+ +
- To be added soon + +
+ +
+ +
+ +
+ + +
+ +
+ +
+ +
+ +
+ +
-Click each link to download, then you can get the compressed package which contains the pre-trained model and the `yaml` config describing how to train the model. +##### 2) Neural vocoders +under preparation ## Copyright and License From 6bfee95f2b9748892cfab2014f5628426817666b Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 3 Jun 2020 15:39:01 +0800 Subject: [PATCH 06/10] Release tts model ckpts with griffin-lim --- README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7bb380e..4e84664 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ Entries to the introduction, and the launch of training and synthsis for differe ## Pre-trained models and audio samples -Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the url link to the pre-trained model, the dataset that the model is trained on, and synthesized audio samples based on the pre-trained model. +Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the url link to the pre-trained model, the dataset that the model is trained on, and synthesized audio samples based on the pre-trained model. Click each model name to download, then you can get the compressed package which contains the pre-trained model and the `yaml` config describing how the model is trained. #### Vocoders @@ -174,29 +174,77 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels, #### TTS models +We also provide checkpoints for different end-to-end TTS models, and present the synthesized audio examples for some randomly chosen famous quotes. The corresponding texts are displayed as follows. 
+ +||Text | From | +|:-:|:-- | :--: | +0|*Life was like a box of chocolates, you never know what you're gonna get.* | *Forrest Gump* | +1|*With great power there must come great responsibility.* | *Spider-Man*| +2|*To be or not to be, that’s a question.*|*Hamlet*| +3|*Death is just a part of life, something we're all destined to do.*| *Forrest Gump*| +4|*Don’t argue with the people of strong determination, because they may change the fact!*| *William Shakespeare* | + +Users have the option to use different vocoders to convert the linear/mel spectrogram to the raw audio in TTS models. Taking this into account, we are going to release the checkpoints for TTS models adapted to different vocoders, including the [Griffin-Lim](https://ieeexplore.ieee.org/document/1164317) algorithm and some neural vocoders. + +##### 1) Griffin-Lim +
- - + + + + @@ -204,8 +252,9 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels,
- Deep Voice 3 + + Deep Voice 3 - Transformer TTS + Transformer TTS
+ FastSpeech +
LJSpeech LJSpeech LJSpeech
- To be added soon + +
+ +
+ +
+ +
+ +
- To be added soon + +
+ +
+ +
+ +
+ + +
+ +
+ +
+ +
+ +
+ +
-Click each link to download, then you can get the compressed package which contains the pre-trained model and the `yaml` config describing how to train the model. +##### 2) Neural vocoders +under preparation ## Copyright and License From 33ed693ccff88935a62739d564c67efa8a1a4a5f Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 12 Jun 2020 08:45:55 +0000 Subject: [PATCH 07/10] Upgrade waveflow api to 1.8.2 --- README.md | 8 ++++---- parakeet/models/waveflow/waveflow_modules.py | 12 +----------- setup.py | 2 +- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 4e84664..812c8a6 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ sudo apt-get install libsndfile1 ### Install PaddlePaddle -See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **1.8.0** or above. +See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **1.8.2** or above. ### Install Parakeet @@ -177,7 +177,7 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels, We also provide checkpoints for different end-to-end TTS models, and present the synthesized audio examples for some randomly chosen famous quotes. The corresponding texts are displayed as follows. ||Text | From | -|:-:|:-- | :--: | +|:-:|:-- | :--: | 0|*Life was like a box of chocolates, you never know what you're gonna get.* | *Forrest Gump* | 1|*With great power there must come great responsibility.* | *Spider-Man*| 2|*To be or not to be, that’s a question.*|*Hamlet*| @@ -232,7 +232,7 @@ Users have the option to use different vocoders to convert the linear/mel spectr
- + @@ -244,7 +244,7 @@ Users have the option to use different vocoders to convert the linear/mel spectr
- + diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index 03f873b..31b29dc 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -79,7 +79,7 @@ class Conditioner(dg.Layer): stride=(1, s), param_attr=param_attr, bias_attr=bias_attr, - dtype="float32") + dtype=dtype) self.upsample_conv2d.append(conv_trans2d) for i, layer in enumerate(self.upsample_conv2d): @@ -88,12 +88,7 @@ class Conditioner(dg.Layer): def forward(self, x): x = fluid.layers.unsqueeze(x, 1) for layer in self.upsample_conv2d: - in_dtype = x.dtype - if in_dtype == fluid.core.VarDesc.VarType.FP16: - x = fluid.layers.cast(x, "float32") x = layer(x) - if in_dtype == fluid.core.VarDesc.VarType.FP16: - x = fluid.layers.cast(x, "float16") x = fluid.layers.leaky_relu(x, alpha=0.4) return fluid.layers.squeeze(x, [1]) @@ -101,12 +96,7 @@ class Conditioner(dg.Layer): def infer(self, x): x = fluid.layers.unsqueeze(x, 1) for layer in self.upsample_conv2d: - in_dtype = x.dtype - if in_dtype == fluid.core.VarDesc.VarType.FP16: - x = fluid.layers.cast(x, "float32") x = layer(x) - if in_dtype == fluid.core.VarDesc.VarType.FP16: - x = fluid.layers.cast(x, "float16") # Trim conv artifacts. 
time_cutoff = layer._filter_size[1] - layer._stride[1] x = fluid.layers.leaky_relu(x[:, :, :, :-time_cutoff], alpha=0.4) diff --git a/setup.py b/setup.py index 244eef8..693534b 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ setup_info = dict( 'inflect', 'librosa', 'unidecode', - 'numba==0.48.0', + 'numba==0.47.0', 'tqdm==4.19.8', 'matplotlib', 'tensorboardX', From 45af3a43b22b6a820fc28ec6842d40d8468c7913 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 12 Jun 2020 10:01:22 +0000 Subject: [PATCH 08/10] fix WeightNormWrapper, stop using CacheDataset for deep voice 3, pin numba version to 0.47.0 --- examples/deepvoice3/data.py | 2 +- parakeet/modules/weight_norm.py | 21 +++++++++++---------- setup.py | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/deepvoice3/data.py b/examples/deepvoice3/data.py index 8ab2bd3..8da3dfb 100644 --- a/examples/deepvoice3/data.py +++ b/examples/deepvoice3/data.py @@ -230,7 +230,7 @@ def make_data_loader(data_root, config): ref_level_db=c["ref_level_db"], max_norm=c["max_norm"], clip_norm=c["clip_norm"]) - ljspeech = CacheDataset(TransformDataset(meta, transform)) + ljspeech = TransformDataset(meta, transform) # use meta data's text length as a sort key for the sampler batch_size = config["train"]["batch_size"] diff --git a/parakeet/modules/weight_norm.py b/parakeet/modules/weight_norm.py index 82203d6..51732a7 100644 --- a/parakeet/modules/weight_norm.py +++ b/parakeet/modules/weight_norm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np from paddle import fluid import paddle.fluid.dygraph as dg import paddle.fluid.layers as F @@ -44,10 +43,10 @@ def norm_except(param, dim, power): if dim is None: return norm(param, dim, power) elif dim == 0: - param_matrix = F.reshape(param, (shape[0], np.prod(shape[1:]))) + param_matrix = F.reshape(param, (shape[0], -1)) return norm(param_matrix, dim=1, power=power) elif dim == -1 or dim == ndim - 1: - param_matrix = F.reshape(param, (np.prod(shape[:-1]), shape[-1])) + param_matrix = F.reshape(param, (-1, shape[-1])) return norm(param_matrix, dim=0, power=power) else: perm = list(range(ndim)) @@ -62,24 +61,26 @@ def compute_l2_normalized_weight(v, g, dim): ndim = len(shape) if dim is None: - v_normalized = v / (F.reduce_sum(F.square(v)) + 1e-12) + v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12) elif dim == 0: - param_matrix = F.reshape(v, (shape[0], np.prod(shape[1:]))) + param_matrix = F.reshape(v, (shape[0], -1)) v_normalized = F.l2_normalize(param_matrix, axis=1) + v_normalized = F.reshape(v_normalized, shape) elif dim == -1 or dim == ndim - 1: - param_matrix = F.reshape(v, (np.prod(shape[:-1]), shape[-1])) + param_matrix = F.reshape(v, (-1, shape[-1])) v_normalized = F.l2_normalize(param_matrix, axis=0) + v_normalized = F.reshape(v_normalized, shape) else: perm = list(range(ndim)) perm[0] = dim perm[dim] = 0 transposed_param = F.transpose(v, perm) - param_matrix = F.reshape( - transposed_param, - (transposed_param.shape[0], np.prod(transposed_param.shape[1:]))) + transposed_shape = transposed_param.shape + param_matrix = F.reshape(transposed_param, + (transposed_param.shape[0], -1)) v_normalized = F.l2_normalize(param_matrix, axis=1) + v_normalized = F.reshape(v_normalized, transposed_shape) v_normalized = F.transpose(v_normalized, perm) - v_normalized = F.reshape(v_normalized, shape) weight = F.elementwise_mul(v_normalized, g, axis=dim) return weight diff --git a/setup.py b/setup.py index 244eef8..693534b 100644 --- 
a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ setup_info = dict( 'inflect', 'librosa', 'unidecode', - 'numba==0.48.0', + 'numba==0.47.0', 'tqdm==4.19.8', 'matplotlib', 'tensorboardX', From 9dad6c3d41cd7e9618b0d0edf1fd8f55103e7eaa Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 12 Jun 2020 10:13:27 +0000 Subject: [PATCH 09/10] fix synthesis for transformerTTS and FastSpeech, use int64 explicitly --- examples/fastspeech/synthesis.py | 4 ++-- examples/transformer_tts/synthesis.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/fastspeech/synthesis.py b/examples/fastspeech/synthesis.py index de726bd..81b55c5 100644 --- a/examples/fastspeech/synthesis.py +++ b/examples/fastspeech/synthesis.py @@ -83,8 +83,8 @@ def synthesis(text_input, args): pos_text = np.arange(1, text.shape[1] + 1) pos_text = np.expand_dims(pos_text, axis=0) - text = dg.to_variable(text) - pos_text = dg.to_variable(pos_text) + text = dg.to_variable(text).astype(np.int64) + pos_text = dg.to_variable(pos_text).astype(np.int64) _, mel_output_postnet = model(text, pos_text, alpha=args.alpha) diff --git a/examples/transformer_tts/synthesis.py b/examples/transformer_tts/synthesis.py index 7d7f965..9a1b0e8 100644 --- a/examples/transformer_tts/synthesis.py +++ b/examples/transformer_tts/synthesis.py @@ -92,15 +92,17 @@ def synthesis(text_input, args): model_vocoder.eval() # init input text = np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0]) mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32) pos_text = np.arange(1, text.shape[1] + 1) - pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) + pos_text = fluid.layers.unsqueeze( + dg.to_variable(pos_text).astype(np.int64), [0]) pbar = tqdm(range(args.max_len)) for i in pbar: pos_mel = np.arange(1, mel_input.shape[1] + 1) - pos_mel = 
fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) + pos_mel = fluid.layers.unsqueeze( + dg.to_variable(pos_mel).astype(np.int64), [0]) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( text, mel_input, pos_text, pos_mel) mel_input = fluid.layers.concat( From 91b1a3af5b6717d6294454b125ad146241d9b19b Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 12 Jun 2020 12:07:36 +0000 Subject: [PATCH 10/10] pin llvmlite version to 0.31.0 --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 244eef8..6061408 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,6 @@ import os import io import re -import six import sys from setuptools import setup, find_packages @@ -65,7 +64,7 @@ setup_info = dict( 'pandas', 'sox', 'soundfile', - 'llvmlite==0.31.0' if sys.version_info < (3, 6) else "llvmlite", + 'llvmlite==0.31.0', ], # Package info