ParakeetRebeccaRosario/parakeet/models/deepvoice3/dry_run.py

114 lines
4.3 KiB
Python

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
from hparams import hparams, hparams_debug_string
from parakeet import g2p as frontend
from deepvoice3 import DeepVoiceTTS
def dry_run(model):
    """
    Run the model once on random data, just to get it initialized.

    A synthetic batch is built from the sizes found in ``hparams`` and fed
    through ``model`` in training mode. Afterwards every entry of
    ``model.state_dict()`` is printed as ``name|shape|element count``,
    followed by the number of entries and the total number of elements.

    Args:
        model: the network to initialize. It must provide ``train()``,
            ``state_dict()`` and be callable as
            ``model(x, input_lengths, mel, speaker_ids, text_positions,
            frame_positions)``.

    Raises:
        RuntimeError: if the longest text or decoder sequence of the
            synthetic batch is not smaller than ``hparams.max_positions``.

    Note:
        The model is left in training mode when this function returns.
        NOTE(review): ``dg.to_variable`` presumably requires the caller to be
        inside a dygraph guard -- confirm against the call site.
    """
    model.train()
    _frontend = getattr(frontend, hparams.frontend)

    batch_size = 4
    enc_length = 157
    snd_sample_length = 500

    r = hparams.outputs_per_step
    downsample_step = hparams.downsample_step
    n_speakers = hparams.n_speakers

    # make sure snd_sample_length can be divided by r * downsample_step
    linear_shift = r * downsample_step
    snd_sample_length += linear_shift - snd_sample_length % linear_shift
    decoder_length = snd_sample_length // downsample_step // r
    mel_length = snd_sample_length // downsample_step

    n_vocab = _frontend.n_vocab
    mel_dim = hparams.num_mels

    # per-example lengths: the last example uses the full enc_length, the
    # others are one token shorter each
    input_lengths = np.arange(
        enc_length - batch_size + 1, enc_length + 1, dtype="int64")
    target_lengths = np.array([snd_sample_length] * batch_size).astype("int64")

    # validate the sequence lengths before building any tensors
    decoder_lengths = target_lengths // r // downsample_step
    max_seq_len = max(input_lengths.max(), decoder_lengths.max())
    if max_seq_len >= hparams.max_positions:
        raise RuntimeError(
            "max_seq_len ({}) >= max_positions ({})\n"
            "Input text or decoder target length exceeded the maximum length.\n"
            "Please set a larger value for ``max_positions`` in hyper parameters."
            .format(max_seq_len, hparams.max_positions))

    # random text ids of shape (batch_size, enc_length, 1) and a random mel
    # spectrogram of shape (batch_size, mel_dim, 1, mel_length)
    x = np.random.randint(
        low=0, high=n_vocab, size=(batch_size, enc_length, 1), dtype="int64")
    mel = np.random.randn(batch_size, mel_dim, 1, mel_length).astype("float32")

    # positions beyond each example's length are reset to 0
    # NOTE(review): text positions start at 0 while frame positions start at
    # 1, and the mask uses ``>`` rather than ``>=`` -- confirm which indexing
    # convention the model's position embedding expects.
    text_positions = np.tile(
        np.arange(
            0, enc_length, dtype="int64"), (batch_size, 1))
    text_mask = text_positions > np.expand_dims(input_lengths, 1)
    text_positions[text_mask] = 0
    frame_positions = np.tile(
        np.arange(
            1, decoder_length + 1, dtype="int64"), (batch_size, 1))

    # cause paddle's embedding layer expect shape[-1] == 1
    text_positions = np.expand_dims(text_positions, axis=-1)
    frame_positions = np.expand_dims(frame_positions, axis=-1)

    # speaker ids are only generated for multi-speaker configurations
    speaker_ids = np.random.randint(
        low=0, high=n_speakers, size=(batch_size, 1),
        dtype="int64") if n_speakers > 1 else None

    x = dg.to_variable(x)
    input_lengths = dg.to_variable(input_lengths)
    mel = dg.to_variable(mel)
    text_positions = dg.to_variable(text_positions)
    frame_positions = dg.to_variable(frame_positions)
    if speaker_ids is not None:
        speaker_ids = dg.to_variable(speaker_ids)

    # first dry run runs the whole model; the outputs themselves are not
    # needed, the call is made only for its initialization side effect
    model(x, input_lengths, mel, speaker_ids, text_positions, frame_positions)

    state = model.state_dict()
    num_parameters = 0
    for k, v in state.items():
        n_elements = np.prod(v.shape)
        print("{}|{}|{}".format(k, v.shape, n_elements))
        num_parameters += n_elements

    print("now model has {} parameters".format(len(state)))
    print("now model has {} elements".format(num_parameters))