# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from paddle import fluid
import paddle.fluid.dygraph as dg

from hparams import hparams, hparams_debug_string
from parakeet import g2p as frontend
from deepvoice3 import DeepVoiceTTS


def dry_run(model):
    """Run the model once on a fake batch, just to get it initialized.

    Paddle dygraph layers create their parameters lazily on the first
    forward pass, so a dry run is required before operations that need
    the parameters to exist (e.g. loading a checkpoint). The fake batch
    mirrors the shapes produced by the real data pipeline. Also prints a
    per-parameter shape/size table and the totals.

    Args:
        model (DeepVoiceTTS): the model to initialize. Modified in place:
            switched to train mode and its parameters created.

    Raises:
        RuntimeError: if the fake input or decoder target length reaches
            ``hparams.max_positions``.
    """
    model.train()
    _frontend = getattr(frontend, hparams.frontend)
    batch_size = 4
    enc_length = 157
    snd_sample_length = 500

    r = hparams.outputs_per_step
    downsample_step = hparams.downsample_step
    n_speakers = hparams.n_speakers

    # make sure snd_sample_length can be divided by r * downsample_step
    linear_shift = r * downsample_step
    snd_sample_length += linear_shift - snd_sample_length % linear_shift
    decoder_length = snd_sample_length // downsample_step // r
    mel_length = snd_sample_length // downsample_step

    n_vocab = _frontend.n_vocab
    linear_dim = model.linear_dim
    mel_dim = hparams.num_mels

    # fake encoder input: token ids with a trailing singleton axis, because
    # paddle's embedding layer expects shape[-1] == 1
    x = np.random.randint(
        low=0, high=n_vocab, size=(batch_size, enc_length, 1), dtype="int64")
    input_lengths = np.arange(
        enc_length - batch_size + 1, enc_length + 1, dtype="int64")
    mel = np.random.randn(batch_size, mel_dim, 1, mel_length).astype("float32")
    # NOTE(review): y and done are built but never fed to the model below;
    # kept because the np.random calls advance the global RNG state.
    y = np.random.randn(batch_size, linear_dim, 1,
                        snd_sample_length).astype("float32")

    # text positions count from 0; positions past each sequence's real
    # length are zeroed out (0 acts as the padding position)
    text_positions = np.tile(
        np.arange(
            0, enc_length, dtype="int64"), (batch_size, 1))
    text_mask = text_positions > np.expand_dims(input_lengths, 1)
    text_positions[text_mask] = 0
    text_positions = np.expand_dims(text_positions, axis=-1)

    # decoder frame positions count from 1
    frame_positions = np.tile(
        np.arange(
            1, decoder_length + 1, dtype="int64"), (batch_size, 1))
    frame_positions = np.expand_dims(frame_positions, axis=-1)

    done = np.zeros(shape=(batch_size, 1, 1, decoder_length), dtype="float32")
    target_lengths = np.array([snd_sample_length] * batch_size).astype("int64")

    # speaker ids only exist for multi-speaker models
    speaker_ids = np.random.randint(
        low=0, high=n_speakers, size=(batch_size, 1),
        dtype="int64") if n_speakers > 1 else None

    x = dg.to_variable(x)
    input_lengths = dg.to_variable(input_lengths)
    mel = dg.to_variable(mel)
    y = dg.to_variable(y)
    text_positions = dg.to_variable(text_positions)
    frame_positions = dg.to_variable(frame_positions)
    done = dg.to_variable(done)
    target_lengths = dg.to_variable(target_lengths)
    speaker_ids = dg.to_variable(
        speaker_ids) if speaker_ids is not None else None

    # these two fields are used as numpy ndarray
    text_lengths = input_lengths.numpy()
    decoder_lengths = target_lengths.numpy() // r // downsample_step

    max_seq_len = max(text_lengths.max(), decoder_lengths.max())
    if max_seq_len >= hparams.max_positions:
        raise RuntimeError(
            "max_seq_len ({}) >= max_positions ({})\n"
            "Input text or decoder target length exceeded the maximum length.\n"
            "Please set a larger value for ``max_positions`` in hyper parameters."
            .format(max_seq_len, hparams.max_positions))

    # first dry run runs the whole model
    mel_outputs, linear_outputs, attn, done_hat = model(
        x, input_lengths, mel, speaker_ids, text_positions, frame_positions)

    # report each parameter tensor's shape/size and the totals:
    # "parameters" = number of tensors, "elements" = total scalar count
    num_parameters = 0
    for k, v in model.state_dict().items():
        print("{}|{}|{}".format(k, v.shape, np.prod(v.shape)))
        num_parameters += np.prod(v.shape)
    print("now model has {} parameters".format(len(model.state_dict())))
    print("now model has {} elements".format(num_parameters))