diff --git a/examples/speedyspeech/baker/synthesize.py b/examples/speedyspeech/baker/synthesize.py
index 8ca686c..262908f 100644
--- a/examples/speedyspeech/baker/synthesize.py
+++ b/examples/speedyspeech/baker/synthesize.py
@@ -28,6 +28,8 @@ import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from paddle import distributed as dist
+from paddle import jit
+from paddle.static import InputSpec
 from yacs.config import CfgNode
 
 from parakeet.datasets.data_table import DataTable
@@ -62,16 +64,35 @@ def evaluate(args, speedyspeech_config, pwg_config):
     mu = paddle.to_tensor(mu)
     std = paddle.to_tensor(std)
     speedyspeech_normalizer = ZScore(mu, std)
+    speedyspeech_normalizer.eval()
 
     stat = np.load(args.pwg_stat)
     mu, std = stat
     mu = paddle.to_tensor(mu)
     std = paddle.to_tensor(std)
     pwg_normalizer = ZScore(mu, std)
+    pwg_normalizer.eval()
+
+    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
+                                                   model)
+    speedyspeech_inference.eval()
+    speedyspeech_inference = jit.to_static(
+        speedyspeech_inference,
+        input_spec=[
+            InputSpec(
+                [-1], dtype=paddle.int64), InputSpec(
+                [-1], dtype=paddle.int64)
+        ])
+    paddle.jit.save(speedyspeech_inference,
+                    os.path.join(args.inference_dir, "speedyspeech"))
 
-    speedyspeech_inferencce = SpeedySpeechInference(speedyspeech_normalizer,
-                                                    model)
     pwg_inference = PWGInference(pwg_normalizer, vocoder)
+    pwg_inference.eval()
+    pwg_inference = jit.to_static(
+        pwg_inference,
+        input_spec=[InputSpec(
+            [-1, 80], dtype=paddle.float32), ])
+    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
 
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -82,7 +103,7 @@ def evaluate(args, speedyspeech_config, pwg_config):
         tones = paddle.to_tensor(datum["tones"])
 
         with paddle.no_grad():
-            wav = pwg_inference(speedyspeech_inferencce(phones, tones))
+            wav = pwg_inference(speedyspeech_inference(phones, tones))
         sf.write(
             output_dir / (utt_id + ".wav"),
             wav.numpy(),
@@ -97,7 +118,7 @@ def main():
     parser.add_argument(
         "--speedyspeech-config",
         type=str,
-        help="config file to overwrite default config")
+        help="config file for speedyspeech.")
     parser.add_argument(
         "--speedyspeech-checkpoint",
         type=str,
@@ -108,10 +129,7 @@
         help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
     )
     parser.add_argument(
-        "--pwg-config",
-        type=str,
-        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
-    )
+        "--pwg-config", type=str, help="config file for parallelwavegan.")
     parser.add_argument(
         "--pwg-params",
         type=str,
@@ -123,6 +141,8 @@
     )
     parser.add_argument("--test-metadata", type=str, help="test metadata")
    parser.add_argument("--output-dir", type=str, help="output dir")
+    parser.add_argument(
+        "--inference-dir", type=str, help="dir to save inference models")
     parser.add_argument(
         "--device", type=str, default="gpu", help="device type to use")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
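Note: with the two `paddle.jit.save` calls above, `--inference-dir` ends up holding a pair of exported static-graph models that can be reloaded without any of the training code. A minimal sketch of how they might be used afterwards (the dummy phone/tone ids and the 24 kHz sample rate are illustrative assumptions, not part of this patch):

    import os

    import paddle
    import soundfile as sf

    inference_dir = "exp/debug/inference"
    speedyspeech = paddle.jit.load(os.path.join(inference_dir, "speedyspeech"))
    pwg = paddle.jit.load(os.path.join(inference_dir, "pwg"))

    # Dummy 1-D id sequences; real inputs come from the test metadata or the frontend.
    phones = paddle.to_tensor([1, 2, 3, 4], dtype=paddle.int64)
    tones = paddle.to_tensor([0, 1, 0, 2], dtype=paddle.int64)

    with paddle.no_grad():
        mel = speedyspeech(phones, tones)  # denormalized mel, shape (T_mel, 80)
        wav = pwg(mel)  # generated waveform
    sf.write("demo.wav", wav.numpy(), samplerate=24000)  # sample rate assumed; take it from the config
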
-    )
+        "--pwg-config", type=str, help="config file for parallelwavegan.")
     parser.add_argument(
         "--pwg-params",
         type=str,
diff --git a/examples/speedyspeech/baker/synthesize.sh b/examples/speedyspeech/baker/synthesize.sh
index e464505..18f056d 100644
--- a/examples/speedyspeech/baker/synthesize.sh
+++ b/examples/speedyspeech/baker/synthesize.sh
@@ -7,4 +7,5 @@ python synthesize.py \
   --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
   --test-metadata=dump/test/norm/metadata.jsonl \
   --output-dir=exp/debug/test \
+  --inference-dir=exp/debug/inference \
   --device="gpu"
diff --git a/examples/speedyspeech/baker/synthesize_e2e.py b/examples/speedyspeech/baker/synthesize_e2e.py
index 9406982..186230c 100644
--- a/examples/speedyspeech/baker/synthesize_e2e.py
+++ b/examples/speedyspeech/baker/synthesize_e2e.py
@@ -25,6 +25,8 @@ import paddle
 import numpy as np
 import soundfile as sf
 import paddle
+from paddle import jit
+from paddle.static import InputSpec
 from paddle import nn
 from paddle.nn import functional as F
 from paddle import distributed as dist
@@ -72,9 +74,26 @@ def evaluate(args, speedyspeech_config, pwg_config):
     std = paddle.to_tensor(std)
     pwg_normalizer = ZScore(mu, std)
 
-    speedyspeech_inferencce = SpeedySpeechInference(speedyspeech_normalizer,
-                                                    model)
+    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
+                                                   model)
+    speedyspeech_inference.eval()
+    speedyspeech_inference = jit.to_static(
+        speedyspeech_inference,
+        input_spec=[
+            InputSpec(
+                [-1], dtype=paddle.int64), InputSpec(
+                [-1], dtype=paddle.int64)
+        ])
+    paddle.jit.save(speedyspeech_inference,
+                    os.path.join(args.inference_dir, "speedyspeech"))
+
     pwg_inference = PWGInference(pwg_normalizer, vocoder)
+    pwg_inference.eval()
+    pwg_inference = jit.to_static(
+        pwg_inference,
+        input_spec=[InputSpec(
+            [-1, 80], dtype=paddle.float32), ])
+    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
 
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -83,7 +102,7 @@ def evaluate(args, speedyspeech_config, pwg_config):
         phones, tones = text_analysis(sentence)
 
         with paddle.no_grad():
-            wav = pwg_inference(speedyspeech_inferencce(phones, tones))
+            wav = pwg_inference(speedyspeech_inference(phones, tones))
         sf.write(
             output_dir / (utt_id + ".wav"),
             wav.numpy(),
@@ -98,7 +117,7 @@ def main():
     parser.add_argument(
         "--speedyspeech-config",
         type=str,
-        help="config file to overwrite default config")
+        help="config file for speedyspeech.")
     parser.add_argument(
         "--speedyspeech-checkpoint",
         type=str,
@@ -109,10 +128,7 @@
         help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
     )
     parser.add_argument(
-        "--pwg-config",
-        type=str,
-        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
-    )
+        "--pwg-config", type=str, help="config file for parallelwavegan.")
     parser.add_argument(
         "--pwg-params",
         type=str,
@@ -127,6 +143,8 @@
         type=str,
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument("--output-dir", type=str, help="output dir")
+    parser.add_argument(
+        "--inference-dir", type=str, help="dir to save inference models")
     parser.add_argument(
         "--device", type=str, default="gpu", help="device type to use")
     parser.add_argument("--verbose", type=int, default=1, help="verbose")
diff --git a/examples/speedyspeech/baker/synthesize_e2e.sh b/examples/speedyspeech/baker/synthesize_e2e.sh
index a12cc16..baf4ef3 100644
--- a/examples/speedyspeech/baker/synthesize_e2e.sh
+++ b/examples/speedyspeech/baker/synthesize_e2e.sh
@@ -6,5 +6,6 @@ python synthesize_e2e.py \
   --pwg-params=../../parallelwave_gan/baker/converted.pdparams \
   --pwg-stat=../../parallelwave_gan/baker/dump/train/stats.npy \
   --text=sentences.txt \
-  --output-dir=exp/e2e \
+  --output-dir=exp/debug/e2e \
+  --inference-dir=exp/debug/inference \
   --device="gpu"
diff --git a/parakeet/models/parallel_wavegan.py b/parakeet/models/parallel_wavegan.py
index eae8112..cd4539f 100644
--- a/parakeet/models/parallel_wavegan.py
+++ b/parakeet/models/parallel_wavegan.py
@@ -44,7 +44,7 @@ class Stretch2D(nn.Layer):
         self.h_scale = h_scale
         self.mode = mode
 
-    def forward(self, x: Tensor) -> Tensor:
+    def forward(self, x):
         """
         Parameters
         ----------
@@ -115,7 +115,7 @@ class UpsampleNet(nn.Layer):
                     nn, nonlinear_activation)(**nonlinear_activation_params)
                 self.up_layers.append(nonlinear)
 
-    def forward(self, c: Tensor) -> Tensor:
+    def forward(self, c):
         """
         Parameters
         ----------
@@ -197,7 +197,7 @@ class ConvInUpsampleNet(nn.Layer):
             freq_axis_kernel_size=freq_axis_kernel_size,
             use_causal_conv=use_causal_conv)
 
-    def forward(self, c: Tensor) -> Tensor:
+    def forward(self, c):
         """
         Parameters
         ----------
@@ -283,7 +283,7 @@ class ResidualBlock(nn.Layer):
         self.conv1x1_skip = nn.Conv1D(
             gate_out_channels, skip_channels, kernel_size=1, bias_attr=bias)
 
-    def forward(self, x: Tensor, c: Tensor) -> Tuple[Tensor, Tensor]:
+    def forward(self, x, c):
         """
         Parameters
         ----------
@@ -439,7 +439,7 @@ class PWGGenerator(nn.Layer):
         if use_weight_norm:
             self.apply_weight_norm()
 
-    def forward(self, x: Tensor, c: Tensor) -> Tensor:
+    def forward(self, x, c):
         """Generate waveform.
 
         Parameters
@@ -492,8 +492,7 @@ class PWGGenerator(nn.Layer):
 
         self.apply(_remove_weight_norm)
 
-    def inference(self, c: Optional[Tensor]=None,
-                  x: Optional[Tensor]=None) -> Tensor:
+    def inference(self, c=None):
         """Waveform generation. This function is used for single instance inference.
@@ -510,17 +509,11 @@ class PWGGenerator(nn.Layer):
         Tensor
             Shape (T, C_out), the generated waveform
         """
-        if x is not None:
-            x = paddle.transpose(x, [1, 0]).unsqueeze(0)  # pseudo batch
-        else:
-            assert c is not None
-            x = paddle.randn(
-                [1, self.in_channels, c.shape[0] * self.upsample_factor])
-
-        if c is not None:
-            c = paddle.transpose(c, [1, 0]).unsqueeze(0)  # pseudo batch
-            c = nn.Pad1D(self.aux_context_window, mode='replicate')(c)
-        out = self.forward(x, c).squeeze(0).transpose([1, 0])
+        x = paddle.randn(
+            [1, self.in_channels, paddle.shape(c)[0] * self.upsample_factor])
+        c = paddle.transpose(c, [1, 0]).unsqueeze(0)  # pseudo batch
+        c = nn.Pad1D(self.aux_context_window, mode='replicate')(c)
+        out = self(x, c).squeeze(0).transpose([1, 0])
         return out
 
@@ -603,7 +596,7 @@ class PWGDiscriminator(nn.Layer):
         if use_weight_norm:
             self.apply_weight_norm()
 
-    def forward(self, x: Tensor) -> Tensor:
+    def forward(self, x):
         """
         Parameters
         ----------
@@ -730,7 +723,7 @@ class ResidualPWGDiscriminator(nn.Layer):
         if use_weight_norm:
             self.apply_weight_norm()
 
-    def forward(self, x: Tensor) -> Tensor:
+    def forward(self, x):
         """
         Parameters
         ----------
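After the `inference` change above, `PWGGenerator.inference` no longer accepts a pre-made excitation `x`: the noise is always sampled internally and only the auxiliary features `c` are passed in, which keeps the traced graph free of `None` checks. A rough usage sketch with a randomly initialized generator (shapes follow the docstring: `c` is (T', C_aux), the output is (T, C_out)):

    import paddle
    from parakeet.models.parallel_wavegan import PWGGenerator

    generator = PWGGenerator()  # default hyperparameters, 80 aux channels
    generator.eval()

    c = paddle.randn([50, 80])  # 50 frames of (fake) auxiliary features
    with paddle.no_grad():
        wav = generator.inference(c)  # the noise input is sampled inside inference()
    print(wav.shape)  # roughly (50 * prod(upsample_scales), 1) with the defaults
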
diff --git a/parakeet/models/speedyspeech.py b/parakeet/models/speedyspeech.py
index e21ecfd..bd7055b 100644
--- a/parakeet/models/speedyspeech.py
+++ b/parakeet/models/speedyspeech.py
@@ -205,7 +205,19 @@ class SpeedySpeech(nn.Layer):
         pred_durations = self.duration_predictor(encodings)  # (1, T)
         durations_to_expand = paddle.round(pred_durations.exp())
         durations_to_expand = (durations_to_expand).astype(paddle.int64)
-        encodings = expand(encodings, durations_to_expand)
+
+        slens = paddle.sum(durations_to_expand, -1)  # [1]
+        t_dec = slens[0]  # [1]
+        t_enc = paddle.shape(pred_durations)[-1]
+        M = paddle.zeros([1, t_dec, t_enc])
+
+        k = paddle.full([1], 0, dtype=paddle.int64)
+        for j in range(t_enc):
+            d = durations_to_expand[0, j]
+            M[0, k:k + d, j] = 1
+            k += d
+
+        encodings = paddle.matmul(M, encodings)
 
         shape = paddle.shape(encodings)
         t_dec, feature_size = shape[1], shape[2]
diff --git a/parakeet/modules/normalizer.py b/parakeet/modules/normalizer.py
index d8d81b4..176741b 100644
--- a/parakeet/modules/normalizer.py
+++ b/parakeet/modules/normalizer.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 from paddle import nn
 
 
@@ -23,7 +24,11 @@ class ZScore(nn.Layer):
         self.register_buffer("sigma", sigma)
 
     def forward(self, x):
-        return (x - self.mu) / self.sigma
+        # NOTE: to be compatible with paddle's to_static, we must explicitly
+        # call multiply, or add, etc, instead of +-*/, etc.
+        return paddle.divide(paddle.subtract(x, self.mu), self.sigma)
 
     def inverse(self, x):
-        return x * self.sigma + self.mu
+        # NOTE: to be compatible with paddle's to_static, we must explicitly
+        # call multiply, or add, etc, instead of +-*/, etc.
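In the SpeedySpeech change above, the data-dependent `expand()` call is replaced by an explicit alignment matrix `M` of shape (1, sum(durations), T_enc), so repeating each encoder state by its predicted duration becomes a single `matmul` that `to_static` can trace; the `ZScore` change is in the same spirit, spelling out the elementwise ops so they trace cleanly. For intuition, a standalone sketch of the same construction with made-up durations (not taken from the patch):

    import paddle

    # Three "phones", each with a 2-dimensional encoding.
    encodings = paddle.arange(6, dtype=paddle.float32).reshape([1, 3, 2])  # (1, T_enc, C)
    durations = paddle.to_tensor([[2, 1, 3]], dtype=paddle.int64)  # frames per phone

    t_dec = int(paddle.sum(durations))  # 6 decoder frames in total
    t_enc = durations.shape[1]
    M = paddle.zeros([1, t_dec, t_enc])

    k = 0
    for j in range(t_enc):
        d = int(durations[0, j])
        M[0, k:k + d, j] = 1  # decoder frames k .. k+d-1 all point at phone j
        k += d

    expanded = paddle.matmul(M, encodings)  # (1, 6, 2): phone j repeated durations[j] times
    print(expanded.numpy())
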
+        return paddle.add(paddle.multiply(x, self.sigma), self.mu)
diff --git a/parakeet/modules/positional_encoding.py b/parakeet/modules/positional_encoding.py
index d4bdea5..919af10 100644
--- a/parakeet/modules/positional_encoding.py
+++ b/parakeet/modules/positional_encoding.py
@@ -26,10 +26,12 @@ def sinusoid_position_encoding(num_positions: int,
                                feature_size: int,
                                omega: float=1.0,
                                start_pos: int=0,
-                               dtype=None) -> Tensor:
+                               dtype=None) -> paddle.Tensor:
     # return tensor shape (num_positions, feature_size)
-    if (feature_size % 2 != 0):
-        raise ValueError("size should be divisible by 2")
+    # NOTE: to be compatible with paddle's to_static, we cannot raise
+    # an exception here; take care of it yourself
+    # if (feature_size % 2 != 0):
+    #     raise ValueError("size should be divisible by 2")
     dtype = dtype or paddle.get_default_dtype()
 
     channel = paddle.arange(0, feature_size, 2, dtype=dtype)
diff --git a/tests/test_raise.py b/tests/test_raise.py
new file mode 100644
index 0000000..a4a5e70
--- /dev/null
+++ b/tests/test_raise.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy as np
+
+import paddle
+from paddle import Tensor
+from paddle.static import InputSpec
+from paddle.nn import functional as F
+
+
+def sinusoid_position_encoding(num_positions: int,
+                               feature_size: int,
+                               omega: float=1.0,
+                               start_pos: int=0,
+                               dtype=None) -> paddle.Tensor:
+    # return tensor shape (num_positions, feature_size)
+
+    if (feature_size % 2 != 0):
+        raise ValueError("size should be divisible by 2")
+    dtype = dtype or paddle.get_default_dtype()
+
+    channel = paddle.arange(0, feature_size, 2, dtype=dtype)
+    index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype)
+    p = (paddle.unsqueeze(index, -1) *
+         omega) / (10000.0**(channel / float(feature_size)))
+    encodings = paddle.zeros([num_positions, feature_size], dtype=dtype)
+    encodings[:, 0::2] = paddle.sin(p)
+    encodings[:, 1::2] = paddle.cos(p)
+    return encodings
+
+
+def call_it(x):
+    shape = paddle.shape(x)
+    a = shape[0]
+    b = shape[1]
+    c = sinusoid_position_encoding(a, b)
+    return c
+
+
+call_it(paddle.randn([8, 32]))
+m = paddle.jit.to_static(
+    call_it, input_spec=[InputSpec(
+        [-1, -1], dtype=paddle.int32)])
+m(paddle.randn([8, 32]).astype(paddle.int32))
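`tests/test_raise.py` above is the reproduction case for the commented-out check: once `feature_size` comes from `paddle.shape(x)` inside a `to_static` program it is a Tensor, and a Python-level `raise` guarded by it is exactly what the NOTE in `positional_encoding.py` has to avoid. With the check removed from `sinusoid_position_encoding`, a caller that still holds plain Python ints can validate before tracing; a minimal sketch (the wrapper name is made up, not part of this patch):

    from parakeet.modules.positional_encoding import sinusoid_position_encoding


    def checked_position_encoding(num_positions: int, feature_size: int):
        # Validate while feature_size is still a plain Python int,
        # i.e. outside any code path that will be traced by to_static.
        if feature_size % 2 != 0:
            raise ValueError("feature_size should be divisible by 2")
        return sinusoid_position_encoding(num_positions, feature_size)
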
diff --git a/tests/test_to_static.py b/tests/test_to_static.py
index 7695eca..251d492 100644
--- a/tests/test_to_static.py
+++ b/tests/test_to_static.py
@@ -15,7 +15,8 @@
 import math
 
 import paddle
-from paddle.jit import to_static
+from paddle import nn
+from paddle.jit import to_static, save
 from paddle.static import InputSpec
 
 
@@ -32,3 +33,33 @@ def test_applicative_evaluation():
 
     print(x)
     print(y)
+
+
+def test_nested_sequential():
+    class Net(nn.Layer):
+        def __init__(self):
+            super().__init__()
+            group1 = nn.Sequential(
+                nn.Linear(2, 3),
+                nn.Sigmoid(), )
+            group2 = nn.Sequential(
+                nn.Sequential(nn.Linear(3, 3)),
+                nn.Linear(3, 4),
+                nn.ReLU(), )
+            self.layers = nn.Sequential(group1, group2)
+
+        def forward(self, x):
+            return self.layers(x)
+
+    net = Net()
+    x = paddle.randn([4, 2])
+    y = net(x)
+    print(y)
+
+    subgraph = to_static(net, input_spec=[InputSpec([-1, 2])])
+    paddle.jit.save(subgraph, './temp_test_to_static')
+
+    fn = paddle.jit.load('./temp_test_to_static')
+    y = fn(x)
+
+    print(y)
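A natural extension of `test_nested_sequential` would be to assert that the reloaded static graph reproduces the dynamic-graph output rather than only printing it. A small self-contained sketch of that round-trip check (the file prefix and tolerances are arbitrary, not part of this patch):

    import numpy as np
    import paddle
    from paddle import nn
    from paddle.static import InputSpec

    layer = nn.Sequential(nn.Linear(2, 3), nn.ReLU())
    layer.eval()
    x = paddle.randn([4, 2])
    y_dygraph = layer(x)

    static_layer = paddle.jit.to_static(layer, input_spec=[InputSpec([-1, 2])])
    paddle.jit.save(static_layer, "./temp_round_trip")
    y_static = paddle.jit.load("./temp_round_trip")(x)

    np.testing.assert_allclose(y_dygraph.numpy(), y_static.numpy(), rtol=1e-5, atol=1e-6)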