138 lines
6.0 KiB
Python
138 lines
6.0 KiB
Python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from deepvoice3 import DeepVoiceTTS, ConvSpec, WindowRange
|
|
|
|
|
|
def deepvoice3(n_vocab,
               embed_dim=256,
               mel_dim=80,
               linear_dim=513,
               r=4,
               downsample_step=1,
               n_speakers=1,
               speaker_dim=16,
               padding_idx=0,
               dropout=(1 - 0.96),
               filter_size=5,
               encoder_channels=128,
               decoder_channels=256,
               converter_channels=256,
               query_position_rate=1.0,
               key_position_rate=1.29,
               use_memory_mask=False,
               trainable_positional_encodings=False,
               force_monotonic_attention=True,
               use_decoder_state_for_postnet_input=True,
               max_positions=512,
               embedding_weight_std=0.1,
               speaker_embedding_weight_std=0.01,
               freeze_embedding=False,
               window_range=WindowRange(-1, 3),
               key_projection=False,
               value_projection=False):
    """Build a ``DeepVoiceTTS`` model with the ``deepvoice3`` layer layout.

    The function only assembles the convolution specifications and forwards
    everything positionally to ``DeepVoiceTTS``; it does no validation.

    Layout built here (each entry is ``ConvSpec(channels, filter_size, x)``,
    where ``x`` is the third ConvSpec field, presumably the dilation;
    confirm against the ConvSpec definition):

    * encoder: 10 specs with ``encoder_channels``, ``x`` cycling
      1, 3, 9, 27, 1, 3, 9, 27, 1, 3.
    * prenet: 2 specs with ``decoder_channels``, ``x`` = 1, 3.
    * attentive: 5 specs with ``decoder_channels``, ``x`` = 1, 3, 9, 27, 1,
      paired with the flags ``[True, False, False, False, True]``.
    * postnet: 4 specs, the first two with ``converter_channels`` and the
      last two with ``2 * converter_channels``, ``x`` = 1, 3, 1, 3.

    Args:
        n_vocab: Forwarded to ``DeepVoiceTTS`` unchanged.
        downsample_step: Clamped from below to 1 and forwarded as the
            time-upsampling argument.
        filter_size: Second field of every ``ConvSpec`` created here.
        encoder_channels: First field of the encoder specs.
        decoder_channels: First field of the prenet and attentive specs.
        converter_channels: Base first field of the postnet specs.
        padding_idx: Forwarded twice (after ``embed_dim`` and after
            ``max_positions``).
        All remaining keyword arguments are forwarded unchanged.

    Returns:
        The constructed ``DeepVoiceTTS`` instance.
    """
    # Clamp to at least 1 before handing it to the model.
    time_upsampling = max(downsample_step, 1)
    kernel = filter_size

    encoder_convolutions = tuple(
        ConvSpec(encoder_channels, kernel, dilation)
        for dilation in (1, 3, 9, 27, 1, 3, 9, 27, 1, 3))

    prenet_convolutions = tuple(
        ConvSpec(decoder_channels, kernel, dilation) for dilation in (1, 3))
    attentive_convolutions = tuple(
        ConvSpec(decoder_channels, kernel, dilation)
        for dilation in (1, 3, 9, 27, 1))
    # One flag per attentive convolution (same length as the tuple above).
    attention = [True, False, False, False, True]

    doubled_channels = 2 * converter_channels
    postnet_convolutions = (
        ConvSpec(converter_channels, kernel, 1),
        ConvSpec(converter_channels, kernel, 3),
        ConvSpec(doubled_channels, kernel, 1),
        ConvSpec(doubled_channels, kernel, 3),
    )

    # Arguments are positional; the order must match DeepVoiceTTS.__init__.
    return DeepVoiceTTS(
        "dv3",
        n_speakers,
        speaker_dim,
        speaker_embedding_weight_std,
        n_vocab,
        embed_dim,
        padding_idx,
        embedding_weight_std,
        freeze_embedding,
        encoder_convolutions,
        max_positions,
        padding_idx,
        trainable_positional_encodings,
        mel_dim,
        r,
        prenet_convolutions,
        attentive_convolutions,
        attention,
        use_memory_mask,
        force_monotonic_attention,
        query_position_rate,
        key_position_rate,
        window_range,
        key_projection,
        value_projection,
        linear_dim,
        postnet_convolutions,
        time_upsampling,
        dropout,
        use_decoder_state_for_postnet_input,
        "float32")
|
|
|
|
|
|
def deepvoice3_multispeaker(n_vocab,
                            embed_dim=256,
                            mel_dim=80,
                            linear_dim=513,
                            r=4,
                            downsample_step=1,
                            n_speakers=1,
                            speaker_dim=16,
                            padding_idx=0,
                            dropout=(1 - 0.96),
                            filter_size=5,
                            encoder_channels=128,
                            decoder_channels=256,
                            converter_channels=256,
                            query_position_rate=1.0,
                            key_position_rate=1.29,
                            use_memory_mask=False,
                            trainable_positional_encodings=False,
                            force_monotonic_attention=True,
                            use_decoder_state_for_postnet_input=True,
                            max_positions=512,
                            embedding_weight_std=0.1,
                            speaker_embedding_weight_std=0.01,
                            freeze_embedding=False,
                            window_range=WindowRange(-1, 3),
                            key_projection=False,
                            value_projection=False):
    """Build a ``DeepVoiceTTS`` model with the multispeaker layer layout.

    Compared with the ``deepvoice3`` builder in this file, this variant uses
    a single prenet convolution and enables only the first attention flag
    (``[True, False, False, False, False]``). Everything else is assembled
    the same way and forwarded positionally to ``DeepVoiceTTS``.

    Layout built here (each entry is ``ConvSpec(channels, filter_size, x)``,
    where ``x`` is the third ConvSpec field, presumably the dilation;
    confirm against the ConvSpec definition):

    * encoder: 10 specs with ``encoder_channels``, ``x`` cycling
      1, 3, 9, 27, 1, 3, 9, 27, 1, 3.
    * prenet: 1 spec with ``decoder_channels``, ``x`` = 1.
    * attentive: 5 specs with ``decoder_channels``, ``x`` = 1, 3, 9, 27, 1.
    * postnet: 4 specs, the first two with ``converter_channels`` and the
      last two with ``2 * converter_channels``, ``x`` = 1, 3, 1, 3.

    Args:
        n_vocab: Forwarded to ``DeepVoiceTTS`` unchanged.
        downsample_step: Clamped from below to 1 and forwarded as the
            time-upsampling argument.
        filter_size: Second field of every ``ConvSpec`` created here.
        encoder_channels: First field of the encoder specs.
        decoder_channels: First field of the prenet and attentive specs.
        converter_channels: Base first field of the postnet specs.
        padding_idx: Forwarded twice (after ``embed_dim`` and after
            ``max_positions``).
        All remaining keyword arguments are forwarded unchanged.

    Returns:
        The constructed ``DeepVoiceTTS`` instance.
    """
    # Clamp to at least 1 before handing it to the model.
    time_upsampling = max(downsample_step, 1)

    h = encoder_channels
    k = filter_size
    encoder_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
                            ConvSpec(h, k, 9), ConvSpec(h, k, 27),
                            ConvSpec(h, k, 1), ConvSpec(h, k, 3),
                            ConvSpec(h, k, 9), ConvSpec(h, k, 27),
                            ConvSpec(h, k, 1), ConvSpec(h, k, 3))

    h = decoder_channels
    # The trailing comma is required: `(ConvSpec(h, k, 1))` is just a
    # parenthesised ConvSpec, not a one-element tuple, so the model would
    # receive a bare spec instead of a sequence of specs.
    prenet_convolutions = (ConvSpec(h, k, 1), )
    attentive_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
                              ConvSpec(h, k, 9), ConvSpec(h, k, 27),
                              ConvSpec(h, k, 1))
    # One flag per attentive convolution (same length as the tuple above).
    attention = [True, False, False, False, False]

    h = converter_channels
    postnet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3),
                            ConvSpec(2 * h, k, 1), ConvSpec(2 * h, k, 3))

    # Arguments are positional; the order must match DeepVoiceTTS.__init__.
    model = DeepVoiceTTS(
        "dv3", n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab,
        embed_dim, padding_idx, embedding_weight_std, freeze_embedding,
        encoder_convolutions, max_positions, padding_idx,
        trainable_positional_encodings, mel_dim, r, prenet_convolutions,
        attentive_convolutions, attention, use_memory_mask,
        force_monotonic_attention, query_position_rate, key_position_rate,
        window_range, key_projection, value_projection, linear_dim,
        postnet_convolutions, time_upsampling, dropout,
        use_decoder_state_for_postnet_input, "float32")
    return model
|