# ParakeetEricRoss/deepvoice3/deepvoice3.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from itertools import chain
from collections import namedtuple
from paddle import fluid
import paddle.fluid.dygraph as dg
import numpy as np
from modules import conv
from modules.modules import Embedding, PositionEmbedding
from modules.modules import FC, Conv1D, Conv1DGLU, Conv1DTranspose
ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])
def expand_speaker_embed(x, speaker_embed, tdim=-1):
"""
Expand speaker embeddings for multiple timesteps.
Args:
x (Variable): A reference Variable used to determine the number of
timesteps.
speaker_embed (Variable): Shape(B, C), embeddings of speakers, where
B means batch_size, C means speaker embedding size.
tdim (int, optional): The index of the time dimension in x. Defaults
to -1, which means the last dimension is the time dimension.
Returns:
Variable: Shape(B, C, 1, T), the expanded speaker embeddings, where
T = x.shape[tdim] is the number of timesteps.
"""
speaker_embed = fluid.layers.reshape(
speaker_embed, shape=speaker_embed.shape + [1, 1])
time_steps = x.shape[tdim]
speaker_embed_bc1t = fluid.layers.expand(
speaker_embed, expand_times=[1, 1, 1, time_steps])
return speaker_embed_bc1t
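# A NumPy sketch of what expand_speaker_embed computes (reference only; shapes
# match the docstring above):
#   sp = speaker_embed.reshape(B, C, 1, 1)
#   sp_bc1t = np.tile(sp, (1, 1, 1, x.shape[tdim]))   # (B, C, 1, T)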
def gen_mask2(valid_lengths, max_len, dtype="float32"):
"""
Generate a mask tensor from valid lengths. Note that it returns a *reversed*
mask: indices within valid lengths correspond to 0, and those within the
padding area correspond to 1.
Assume that valid_lengths = [2,5,7], and max_len = 7, the generated mask is
[[0, 0, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 0, 0, 0]].
Args:
valid_lengths (Variable): Shape(B), dtype: int64. A 1D-Tensor containing
the valid lengths (timesteps) of each example, where B means
batch_size.
max_len (int): The length (number of timesteps) of the mask.
dtype (str, optional): A string that specifies the data type of the
returned mask.
Returns:
mask (Variable): A mask computed from valid lengths.
"""
batch_size = valid_lengths.shape[0]
mask = fluid.layers.sequence_mask(
valid_lengths, maxlen=max_len, dtype=dtype)
mask = 1 - mask
return mask
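# A NumPy sketch of the reversed mask computed above (reference only):
#   positions = np.arange(max_len)                          # (T,)
#   mask = (positions[None, :] >= valid_lengths[:, None])   # True inside padding
#   mask = mask.astype(dtype)                               # 1 = padding, 0 = valid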
def expand_mask(mask, attn):
"""
Expand a mask for multiple time steps. This function is used
by the AttentionLayer in the Decoder to expand a mask for every
timestep in the decoder.
Args:
mask (Variable): Shape(B, T_enc), a mask generated with valid
text lengths, where T_enc means the encoder length (time steps).
attn (Variable): Shape(B, T_dec, T_enc), a Variable that stands for
the alignment tensor between the encoder and the decoder, where
T_dec means the decoder length (time steps).
Returns:
mask_btc (Variable): shape(B, T_dec, T_enc), the expanded mask.
"""
decoder_length = attn.shape[1]
mask = fluid.layers.reshape(mask, [mask.shape[0], 1, mask.shape[1]])
mask_btc = fluid.layers.expand(mask, expand_times=[1, decoder_length, 1])
return mask_btc
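# NumPy sketch (reference only): mask_btc = np.tile(mask[:, None, :], (1, T_dec, 1))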
class Encoder(dg.Layer):
def __init__(self,
name_scope,
n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=0.1,
convolutions=(ConvSpec(64, 5, 1),) * 7,
max_positions=512,
dropout=0.1,
dtype="float32"):
super(Encoder, self).__init__(name_scope, dtype=dtype)
self.dropout = dropout
self.embedding_weight_std = embedding_weight_std
self.embed = Embedding(
self.full_name(),
n_vocab,
embed_dim,
padding_idx=padding_idx,
std=embedding_weight_std,
dtype=dtype)
if n_speakers > 1:
self.sp_proj1 = Conv1D(
self.full_name(),
speaker_dim,
embed_dim,
filter_size=1,
std_mul=1.0,
dropout=dropout,
act="softsign",
dtype=dtype)
self.sp_proj2 = Conv1D(
self.full_name(),
speaker_dim,
embed_dim,
filter_size=1,
std_mul=1.0,
dropout=dropout,
act="softsign",
dtype=dtype)
self.n_speakers = n_speakers
self.convolutions = []
in_channels = embed_dim
std_mul = 1.0
for (out_channels, filter_size, dilation) in convolutions:
# 1 * 1 convolution & relu
if in_channels != out_channels:
self.convolutions.append(
Conv1D(
self.full_name(),
in_channels,
out_channels,
filter_size=1,
std_mul=std_mul,
act="relu",
dtype=dtype))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul=std_mul,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype))
in_channels = out_channels
std_mul = 4.0
self.convolutions.append(
Conv1D(
self.full_name(),
in_channels,
embed_dim,
filter_size=1,
std_mul=std_mul,
dropout=dropout,
dtype=dtype))
for i, layer in enumerate(self.convolutions):
self.add_sublayer("convolution_{}".format(i), layer)
def forward(self, x, speaker_embed=None):
"""
Encode text sequence.
Args:
x (Variable): Shape(B, T_enc, 1), dtype: int64. The input text
indices. T_enc means the timesteps of the input text x.
speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
dtype: float32. Speaker embeddings. This arg is not None only
when the model is a multispeaker model.
Returns:
keys (Variable), Shape(B, C_emb, 1, T_enc), the encoded
representation for keys, where C_emb means the text embedding
size.
values (Variable), Shape(B, C_emb, 1, T_enc), the encoded
representation for values.
"""
x = self.embed(x)
x = fluid.layers.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = fluid.layers.transpose(
fluid.layers.reshape(
x, shape=x.shape + [1]), perm=[0, 2, 3, 1])
speaker_embed_bc1t = None
if speaker_embed is not None:
speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=3)
speaker_embed_bc1t = fluid.layers.dropout(
speaker_embed_bc1t,
self.dropout,
dropout_implementation="upscale_in_train")
x = x + self.sp_proj1(speaker_embed_bc1t)
input_embed = x
for layer in self.convolutions:
if isinstance(layer, Conv1DGLU):
x = layer(x, speaker_embed_bc1t)
else:
x = layer(x)
if speaker_embed is not None:
x = x + self.sp_proj2(speaker_embed_bc1t)
keys = x
values = fluid.layers.scale(input_embed + x, scale=np.sqrt(0.5))
return keys, values
def freeze_embedding(self):
"""Fix text embedding while training."""
for param in self.embed.parameters():
param.trainable = False
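# Construction sketch for the Encoder (hypothetical hyperparameters; assumes a
# dg.guard() context and that `text` is an int64 Variable of shape (B, T_enc, 1)):
#   encoder = Encoder("encoder", n_vocab=256, embed_dim=256, n_speakers=1,
#                     speaker_dim=16, convolutions=(ConvSpec(64, 5, 1),) * 7)
#   keys, values = encoder(text)   # both of shape (B, embed_dim, 1, T_enc)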
class AttentionLayer(dg.Layer):
def __init__(self,
name_scope,
conv_channels,
embed_dim,
dropout=0.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True,
dtype="float32"):
super(AttentionLayer, self).__init__(name_scope, dtype=dtype)
self.query_proj = Conv1D(
self.full_name(),
conv_channels,
embed_dim,
filter_size=1,
dtype=dtype)
if key_projection:
self.key_proj = Conv1D(
self.full_name(),
embed_dim,
embed_dim,
filter_size=1,
dtype=dtype)
if value_projection:
self.value_proj = Conv1D(
self.full_name(),
embed_dim,
embed_dim,
filter_size=1,
dtype=dtype)
self.out_proj = Conv1D(
self.full_name(),
embed_dim,
conv_channels,
filter_size=1,
dtype=dtype)
self.key_projection = key_projection
self.value_projection = value_projection
self.dropout = dropout
self.window_range = window_range
def forward(self, query, encoder_out, mask=None, last_attended=None):
"""
Compute pooled context representation and alignment scores.
Args:
query (Variable): shape(B, C_q, 1, T_dec), the query tensor,
where C_q means the channel of query.
encoder_out (Tuple(Variable, Variable)):
keys (Variable): shape(B, C_emb, 1, T_enc), the key
representation from an encoder, where C_emb means
text embedding size.
values (Variable): shape(B, C_emb, 1, T_enc), the value
representation from an encoder, where C_emb means
text embedding size.
mask (Variable, optional): Shape(B, T_enc), mask generated with
valid text lengths.
last_attended (int, optional): The position that received the most
attention at the last timestep. This is only used in decoding.
Returns:
x (Variable): Shape(B, C_q, 1, T_dec), the context representation
pooled from the attention mechanism.
attn_scores (Variable): shape(B, T_dec, T_enc), the alignment
tensor, where T_dec means the number of decoder time steps and
T_enc means the number of encoder time steps.
"""
keys, values = encoder_out
residual = query
if self.value_projection:
values = self.value_proj(values)
if self.key_projection:
keys = self.key_proj(keys)
x = self.query_proj(query)
batch_size, conv_channels, _, decoder_length = query.shape
encoder_length = keys.shape[-1]
embed_dim = keys.shape[1]
x = fluid.layers.matmul(
fluid.layers.reshape(
x, shape=[batch_size, embed_dim, decoder_length]),
fluid.layers.reshape(
keys, shape=[batch_size, embed_dim, encoder_length]),
transpose_x=True)
mask_value = -1.0e30
if mask is not None:
mask = expand_mask(mask, x)
neg_inf_mask = fluid.layers.scale(mask, mask_value)
x = x + neg_inf_mask
# if last_attended is provided, focus only on a window range around it
# to enforce monotonic attention.
if last_attended is not None:
locality_mask = np.ones(shape=x.shape, dtype=np.float32)
backward, ahead = self.window_range
backward = last_attended + backward
ahead = last_attended + ahead
if backward < 0:
backward = 0
if ahead > x.shape[-1]:
ahead = x.shape[-1]
locality_mask[:, :, backward:ahead] = 0.
locality_mask = dg.to_variable(locality_mask)
neg_inf_mask = fluid.layers.scale(locality_mask, mask_value)
x = x + neg_inf_mask
x = fluid.layers.softmax(x)
attn_scores = x
x = fluid.layers.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = fluid.layers.matmul(
fluid.layers.reshape(
values, shape=[batch_size, embed_dim, encoder_length]),
x,
transpose_y=True)
x = fluid.layers.reshape(x, [batch_size, embed_dim, 1, decoder_length])
x = fluid.layers.scale(x,
encoder_length * np.sqrt(1.0 / encoder_length))
x = self.out_proj(x)
x = fluid.layers.scale((x + residual), np.sqrt(0.5))
return x, attn_scores
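# The attention above, written out as shape-annotated pseudocode (a sketch of
# the computation, not runnable as-is):
#   scores = matmul(Q^T, K)                     # (B, T_dec, T_enc)
#   scores += -1e30 * (pad_mask + window_mask)  # masked positions become ~-inf
#   attn = softmax(scores, axis=-1)             # rows sum to 1 over T_enc
#   ctx = matmul(V, attn^T)                     # (B, C_emb, T_dec)
#   ctx *= T_enc * sqrt(1 / T_enc)              # the scaling used in the code above
#   x = sqrt(0.5) * (out_proj(ctx) + query)     # scaled residual connection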
class Decoder(dg.Layer):
def __init__(self,
name_scope,
n_speakers,
speaker_dim,
embed_dim,
mel_dim=80,
r=5,
max_positions=512,
padding_idx=None,
preattention=(ConvSpec(128, 5, 1),) * 4,
convolutions=(ConvSpec(128, 5, 1),) * 4,
attention=True,
dropout=0.1,
use_memory_mask=False,
force_monotonic_attention=False,
query_position_rate=1.0,
key_position_rate=1.29,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True,
dtype="float32"):
super(Decoder, self).__init__(name_scope, dtype=dtype)
self.dropout = dropout
self.mel_dim = mel_dim
self.r = r
self.query_position_rate = query_position_rate
self.key_position_rate = key_position_rate
self.window_range = window_range
self.n_speakers = n_speakers
conv_channels = convolutions[0].out_channels
self.embed_query_positions = PositionEmbedding(
self.full_name(),
max_positions,
conv_channels,
padding_idx=padding_idx,
dtype=dtype)
self.embed_keys_positions = PositionEmbedding(
self.full_name(),
max_positions,
embed_dim,
padding_idx=padding_idx,
dtype=dtype)
# Used to compute multiplier for position rate
if n_speakers > 1:
self.speaker_proj1 = FC(self.full_name(),
speaker_dim,
1,
act="sigmoid",
dropout=dropout,
dtype=dtype)
self.speaker_proj2 = FC(self.full_name(),
speaker_dim,
1,
act="sigmoid",
dropout=dropout,
dtype=dtype)
# prenet
self.prenet = []
in_channels = mel_dim * r
std_mul = 1.0
for (out_channels, filter_size, dilation) in preattention:
if in_channels != out_channels:
# conv1d & relu
self.prenet.append(
Conv1D(
self.full_name(),
in_channels,
out_channels,
filter_size=1,
std_mul=std_mul,
act="relu"))
in_channels = out_channels
std_mul = 2.0
self.prenet.append(
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul=std_mul,
dropout=dropout,
causal=True,
residual=True,
dtype=dtype))
in_channels = out_channels
std_mul = 4.0
for i, layer in enumerate(self.prenet):
self.add_sublayer("prenet_{}".format(i), layer)
self.use_memory_mask = use_memory_mask
if isinstance(attention, bool):
self.attention = [attention] * len(convolutions)
else:
self.attention = attention
if isinstance(force_monotonic_attention, bool):
self.force_monotonic_attention = [force_monotonic_attention
] * len(convolutions)
else:
self.force_monotonic_attention = force_monotonic_attention
# causal convolution & attention
self.conv_attn = []
for use_attention, (out_channels, filter_size,
dilation) in zip(self.attention, convolutions):
assert (
in_channels == out_channels
), "the stack of convolution & attention does not change channels"
conv_layer = Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul=std_mul,
dropout=dropout,
causal=True,
residual=False,
dtype=dtype)
attn_layer = (AttentionLayer(
self.full_name(),
out_channels,
embed_dim,
dropout=dropout,
window_range=window_range,
key_projection=key_projection,
value_projection=value_projection,
dtype=dtype) if use_attention else None)
in_channels = out_channels
std_mul = 4.0
self.conv_attn.append((conv_layer, attn_layer))
for i, (conv_layer, attn_layer) in enumerate(self.conv_attn):
self.add_sublayer("conv_{}".format(i), conv_layer)
if attn_layer is not None:
self.add_sublayer("attn_{}".format(i), attn_layer)
# 1 * 1 conv to transform channels
self.last_conv = Conv1D(
self.full_name(),
in_channels,
mel_dim * r,
filter_size=1,
std_mul=std_mul,
dropout=dropout,
dtype=dtype)
# mel (before sigmoid) to done hat
self.fc = Conv1D(
self.full_name(), mel_dim * r, 1, filter_size=1, dtype=dtype)
# decoding configs
self.max_decoder_steps = 200
self.min_decoder_steps = 10
def freeze_positional_encoding(self):
for param in self.embed_query_positions.parameters():
param.trainable = False
for param in self.embed_keys_positions.parameters():
param.trainable = False
def forward(self,
encoder_out,
lengths,
inputs,
text_positions,
frame_positions,
speaker_embed=None):
"""
Compute decoder outputs with ground truth mel spectrogram.
Args:
encoder_out (Tuple(Variable, Variable)):
keys (Variable): shape(B, C_emb, 1, T_enc), the key
representation from an encoder, where C_emb means
text embedding size.
values (Variable): shape(B, C_emb, 1, T_enc), the value
representation from an encoder, where C_emb means
text embedding size.
lengths (Variable): Shape(batch_size,), dtype: int64, valid lengths
of text inputs for each example.
inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
mel-spectrogram, which is used as decoder inputs when training.
text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
Position indices of the text inputs for the encoder, where
T_enc means the encoder timesteps.
frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
int64. Position indices for each decoder time step.
speaker_embed: shape(batch_size, speaker_dim), speaker embedding,
only used for multispeaker model.
Returns:
outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of mel spectrogram. Note that, when r > 1, the decoder
outputs r frames of mel spectrogram per step.
alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, where N means number
of Attention Layers, T_mel means the length of mel spectrogram,
r means the outputs per decoder step, T_enc means the encoder
time steps.
done (Variable): Shape(B, 1, 1, T_mel // r), probability that the
outputs should stop.
decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder
hidden states, where C_dec means the channels of decoder states.
"""
# pack multiple frames if necessary
B, _, _, T = inputs.shape
if self.r > 1 and inputs.shape[1] == self.mel_dim:
if T % self.r != 0:
inputs = fluid.layers.slice(
inputs, axes=[3], starts=[0], ends=[T - T % self.r])
inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1])
inputs = fluid.layers.reshape(
inputs, shape=[B, -1, 1, self.mel_dim * self.r])
inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1])
assert inputs.shape[3] == T // self.r
if speaker_embed is not None:
speaker_embed_bc1t = expand_speaker_embed(inputs, speaker_embed)
speaker_embed_bc1t = fluid.layers.dropout(
speaker_embed_bc1t,
self.dropout,
dropout_implementation="upscale_in_train")
else:
speaker_embed_bc1t = None
keys, values = encoder_out
if self.use_memory_mask and lengths is not None:
mask = gen_mask2(lengths, keys.shape[-1])
else:
mask = None
if text_positions is not None:
w = self.key_position_rate
if self.n_speakers > 1:
w = w * fluid.layers.reshape(
self.speaker_proj1(speaker_embed), [B, -1])
text_pos_embed = self.embed_keys_positions(text_positions, w)
text_pos_embed = fluid.layers.transpose(
fluid.layers.reshape(
text_pos_embed, shape=text_pos_embed.shape + [1]),
perm=[0, 2, 3, 1])
keys = keys + text_pos_embed
if frame_positions is not None:
w = self.query_position_rate
if self.n_speakers > 1:
w = w * fluid.layers.reshape(
self.speaker_proj2(speaker_embed), [B, -1])
frame_pos_embed = self.embed_query_positions(frame_positions, w)
frame_pos_embed = fluid.layers.transpose(
fluid.layers.reshape(
frame_pos_embed, shape=frame_pos_embed.shape + [1]),
perm=[0, 2, 3, 1])
else:
frame_pos_embed = None
x = inputs
x = fluid.layers.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
x = (layer(x, speaker_embed_bc1t)
if isinstance(layer, Conv1DGLU) else layer(x))
# Convolution & Multi-hop Attention
alignments = []
for conv, attn in self.conv_attn:
residual = x
x = conv(x, speaker_embed_bc1t)
if attn is not None:
if frame_pos_embed is not None:
x = x + frame_pos_embed
x, attn_scores = attn(x, (keys, values), mask)
alignments.append(attn_scores)
x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5))
alignments = fluid.layers.stack(alignments)
decoder_states = x
x = self.last_conv(x)
outputs = fluid.layers.sigmoid(x)
done = fluid.layers.sigmoid(self.fc(x))
return outputs, alignments, done, decoder_states
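# Teacher-forced call sketch (hypothetical shapes; `keys`/`values` come from
# the Encoder, `mel` is the ground-truth spectrogram of shape (B, C_mel, 1, T_mel)):
#   outputs, alignments, done, states = decoder(
#       (keys, values), lengths, mel, text_positions, frame_positions)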
def decode(self,
encoder_out,
text_positions,
speaker_embed=None,
initial_input=None,
test_inputs=None):
"""
Decode without ground truth mel spectrogram.
Args:
encoder_out (Tuple(Variable, Variable)):
keys (Variable): shape(B, C_emb, 1, T_enc), the key
representation from an encoder, where C_emb means
text embedding size.
values (Variable): shape(B, C_emb, 1, T_enc), the value
representation from an encoder, where C_emb means
text embedding size.
text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
Position indices of the text inputs for the encoder, where
T_enc means the encoder timesteps.
speaker_embed (Variable): Shape(B, C_sp), where C_sp means
speaker embedding size. It is only used for multispeaker model.
initial_input (Variable, optional): Shape(B, C_mel * r, 1, 1).
The input for the first time step of the decoder. If r > 1,
it is r packed frames of mel spectrogram.
test_inputs (Variable, optional): Shape(B, C_mel, 1, T_test),
where T_test means the time steps of the test inputs. This is
only used for testing this method; users should leave it as
None.
Returns:
outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of output mel spectrogram. Note that, when r > 1,
the decoder outputs r frames of mel spectrogram per step.
alignments (Variable): Shape(B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, T_mel means the
length of output mel spectrogram, r means the outputs per
decoder step, T_enc means the encoder time steps.
done (Variable): Shape(B, 1, 1, T_mel // r), probability that the
outputs should stop.
decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder
hidden states, where C_dec means the channels of decoder states.
"""
self.start_new_sequence()
keys, values = encoder_out
B = keys.shape[0]
assert B == 1, "now only supports single instance inference"
mask = None # no mask because we use single instance decoding
w = self.key_position_rate
if speaker_embed is not None:
if self.n_speakers > 1:
w = w * fluid.layers.reshape(
self.speaker_proj1(speaker_embed), shape=[B, -1])
speaker_embed_bc11 = fluid.layers.reshape(
speaker_embed, shape=[B, speaker_embed.shape[1], 1, 1])
else:
speaker_embed_bc11 = None
if text_positions is not None:
text_pos_embed = self.embed_keys_positions(text_positions, w)
text_pos_embed = fluid.layers.transpose(
fluid.layers.reshape(
text_pos_embed, shape=text_pos_embed.shape + [1]),
perm=[0, 2, 3, 1])
keys = keys + text_pos_embed
# start decoding, init accumulators
decoder_states = []
outputs = []
alignments = []
dones = []
last_attended = [None] * len(self.conv_attn)
for idx, monotonic_attn in enumerate(self.force_monotonic_attention):
if monotonic_attn:
last_attended[idx] = 0
t = 0 # decoder time step
if initial_input is None:
initial_input = fluid.layers.zeros(
shape=[B, self.mel_dim * self.r, 1, 1], dtype=keys.dtype)
current_input = initial_input
while True:
frame_pos = fluid.layers.fill_constant(
shape=[B, 1, 1], value=t + 1, dtype="int64")
w = self.query_position_rate
if self.n_speakers > 1:
w = w * fluid.layers.reshape(
self.speaker_proj2(speaker_embed), shape=[B, -1])
frame_pos_embed = self.embed_query_positions(frame_pos, w)
frame_pos_embed = fluid.layers.transpose(
fluid.layers.reshape(
frame_pos_embed, shape=frame_pos_embed.shape + [1]),
perm=[0, 2, 3, 1])
if test_inputs is not None:
if t >= test_inputs.shape[3]:
break
current_input = fluid.layers.reshape(
test_inputs[:, :, :, t],
shape=[B, test_inputs.shape[1], 1, 1])
else:
if t > 0:
current_input = outputs[-1]
x = current_input
x = fluid.layers.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
x = (layer.add_input(x, speaker_embed_bc11)
if isinstance(layer, Conv1DGLU) else layer.add_input(x))
step_attn_scores = []
# Causal convolutions + multi-hop attention
for i, (conv, attn) in enumerate(self.conv_attn):
residual = x
x = conv.add_input(x, speaker_embed_bc11)
if attn is not None:
if frame_pos_embed is not None:
x = x + frame_pos_embed
x, attn_scores = attn(x, (keys, values), mask,
last_attended[i])
step_attn_scores.append(attn_scores)
# update last attended when necessary
if self.force_monotonic_attention[i]:
last_attended[i] = np.argmax(
attn_scores.numpy(), axis=-1)[0][0]
x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5))
if len(step_attn_scores):
average_attn_scores = fluid.layers.reduce_mean(
fluid.layers.stack(step_attn_scores), dim=0)
else:
average_attn_scores = None
decoder_state = x
x = self.last_conv.add_input(x)
output = fluid.layers.sigmoid(x) # (B, r * C_mel, 1, 1)
done = fluid.layers.sigmoid(self.fc(x)) # (B, 1, 1, 1)
decoder_states.append(decoder_state)
outputs.append(output)
if average_attn_scores is not None:
alignments.append(average_attn_scores)
dones.append(done)
t += 1
if test_inputs is None:
if (fluid.layers.reduce_min(done).numpy()[0] > 0.5 and
t > self.min_decoder_steps):
break
elif t > self.max_decoder_steps:
break
outputs = fluid.layers.concat(outputs, axis=3)
if len(alignments):
alignments = fluid.layers.concat(alignments, axis=1)
else:
alignments = None
dones = fluid.layers.concat(dones, axis=3)
decoder_states = fluid.layers.concat(decoder_states, axis=3)
return outputs, alignments, dones, decoder_states
def start_new_sequence(self):
for layer in self.sublayers():
if isinstance(layer, conv.Conv1D):
layer.start_new_sequence()
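# Autoregressive decoding sketch (hypothetical; decode() currently supports a
# batch size of 1 and calls start_new_sequence() itself to reset the
# incremental buffers of the causal convolutions):
#   outputs, alignments, dones, states = decoder.decode(
#       (keys, values), text_positions)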
class Converter(dg.Layer):
"""
Vocoder that transforms mel spectrogram (or decoder hidden states)
to waveform.
"""
def __init__(self,
name_scope,
n_speakers,
speaker_dim,
in_channels,
linear_dim,
convolutions=(ConvSpec(256, 5, 1),) * 4,
time_upsampling=1,
dropout=0.1,
dtype="float32"):
super(Converter, self).__init__(name_scope, dtype=dtype)
self.n_speakers = n_speakers
self.speaker_dim = speaker_dim
self.in_channels = in_channels
self.linear_dim = linear_dim
self.time_upsampling = time_upsampling
self.dropout = dropout
target_channels = convolutions[0][0]
# conv proj to target channels
self.first_conv_proj = Conv1D(
self.full_name(),
in_channels,
target_channels,
filter_size=1,
std_mul=1.0,
dtype=dtype)
# Idea from nyanko
# upsampling convolutions
if time_upsampling == 4:
self.upsampling_convolutions = [
Conv1DTranspose(
self.full_name(),
target_channels,
target_channels,
filter_size=2,
padding=0,
stride=2,
std_mul=1.0,
dtype=dtype),
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=1,
std_mul=1.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype),
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=3,
std_mul=4.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype),
Conv1DTranspose(
self.full_name(),
target_channels,
target_channels,
filter_size=2,
padding=0,
stride=2,
std_mul=4.0,
dtype=dtype),
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=1,
std_mul=1.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype),
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=3,
std_mul=4.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype),
]
elif time_upsampling == 2:
self.upsampling_convolutions = [
Conv1DTranspose(
self.full_name(),
target_channels,
target_channels,
filter_size=2,
padding=0,
stride=2,
std_mul=1.0,
dtype=dtype),
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=1,
std_mul=1.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype),
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=3,
std_mul=4.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype),
]
elif time_upsampling == 1:
self.upsampling_convolutions = [
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
target_channels,
target_channels,
filter_size=3,
dilation=3,
std_mul=4.0,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype)
]
else:
raise ValueError("Not supported.")
for i, layer in enumerate(self.upsampling_convolutions):
self.add_sublayer("upsampling_convolutions_{}".format(i), layer)
# post conv layers
std_mul = 4.0
in_channels = target_channels
self.convolutions = []
for (out_channels, filter_size, dilation) in convolutions:
if in_channels != out_channels:
self.convolutions.append(
Conv1D(
self.full_name(),
in_channels,
out_channels,
filter_size=1,
std_mul=std_mul,
act="relu",
dtype=dtype))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(
self.full_name(),
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size=filter_size,
dilation=dilation,
std_mul=std_mul,
dropout=dropout,
causal=False,
residual=True,
dtype=dtype))
in_channels = out_channels
std_mul = 4.0
for i, layer in enumerate(self.convolutions):
self.add_sublayer("convolutions_{}".format(i), layer)
# final conv proj, channel transformed to linear dim
self.last_conv_proj = Conv1D(
self.full_name(),
in_channels,
linear_dim,
filter_size=1,
std_mul=std_mul,
dropout=dropout,
act="sigmoid",
dtype=dtype)
def forward(self, x, speaker_embed=None):
"""
Convert mel spectrogram or decoder hidden states to linear spectrogram.
Args:
x (Variable): Shape(B, C_in, 1, T_mel), converter inputs, where
C_in means the input channel for the converter. Note that it
can be either C_mel (channel of mel spectrogram) or C_dec // r.
When using mel spectrogram as the converter input, C_in =
C_mel; when using decoder states as the converter input,
C_in = C_dec // r. In the latter scenario, decoder hidden
states are treated as if they were r outputs per decoder step
and are unpacked before being passed to the converter.
speaker_embed (Variable, optional): shape(B, C_sp), speaker
embedding, where C_sp means the speaker embedding size.
Returns:
out (Variable): Shape(B, C_lin, 1, T_lin), the output linear
spectrogram, where C_lin means the channel of linear
spectrogram and T_lin means the length (time steps) of linear
spectrogram. T_lin = time_upsampling * T_mel, which depends
on the time_upsampling factor of the converter.
"""
speaker_embed_bc1t = None
if speaker_embed is not None:
speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=-1)
speaker_embed_bc1t = fluid.layers.dropout(
speaker_embed_bc1t,
self.dropout,
dropout_implementation="upscale_in_train")
x = self.first_conv_proj(x)
for layer in chain(self.upsampling_convolutions, self.convolutions):
# time_steps may change when time_upsampling > 1
if (speaker_embed_bc1t is not None and
speaker_embed_bc1t.shape[3] != x.shape[3]):
speaker_embed_bc1t = expand_speaker_embed(
x, speaker_embed, tdim=3)
speaker_embed_bc1t = fluid.layers.dropout(
speaker_embed_bc1t,
self.dropout,
dropout_implementation="upscale_in_train")
x = (layer(x, speaker_embed_bc1t)
if isinstance(layer, Conv1DGLU) else layer(x))
out = self.last_conv_proj(x)
return out
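# Construction sketch for the Converter (hypothetical hyperparameters): with
# time_upsampling=4, an input of T_mel frames yields T_lin = 4 * T_mel frames:
#   converter = Converter("converter", n_speakers=1, speaker_dim=16,
#                         in_channels=80, linear_dim=513,
#                         convolutions=(ConvSpec(256, 5, 1),) * 4,
#                         time_upsampling=4)
#   linear = converter(mel)   # (B, 80, 1, T_mel) -> (B, 513, 1, 4 * T_mel)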
class DeepVoiceTTS(dg.Layer):
def __init__(self, name_scope, n_speakers, speaker_dim,
speaker_embedding_weight_std, n_vocab, embed_dim,
text_padding_idx, text_embedding_weight_std,
freeze_text_embedding, encoder_convolutions, max_positions,
position_padding_idx, trainable_positional_encodings, mel_dim,
r, prenet_convolutions, attentive_convolutions, attention,
use_memory_mask, force_monotonic_attention,
query_position_rate, key_position_rate, window_range,
key_projection, value_projection, linear_dim,
postnet_convolutions, time_upsampling, dropout,
use_decoder_state_for_postnet_input, dtype):
super(DeepVoiceTTS, self).__init__(name_scope, dtype)
self.n_speakers = n_speakers
self.speaker_dim = speaker_dim
if n_speakers > 1:
self.speaker_embedding = Embedding(
self.full_name(),
n_speakers,
speaker_dim,
padding_idx=None,
std=speaker_embedding_weight_std,
dtype=dtype)
self.embed_dim = embed_dim
self.mel_dim = mel_dim
self.r = r
self.seq2seq = ConvS2S(
self.full_name(), n_speakers, speaker_dim,
speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx,
text_embedding_weight_std, freeze_text_embedding,
encoder_convolutions, max_positions, position_padding_idx,
trainable_positional_encodings, mel_dim, r, prenet_convolutions,
attentive_convolutions, attention, use_memory_mask,
force_monotonic_attention, query_position_rate, key_position_rate,
window_range, key_projection, value_projection, dropout, dtype)
self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input
if use_decoder_state_for_postnet_input:
assert (
attentive_convolutions[-1].out_channels % self.r == 0
), "when using decoder states as converter input, the decoder state channels must be divisible by r"
converter_input_channels = attentive_convolutions[
-1].out_channels // r
else:
converter_input_channels = mel_dim
self.converter_input_channels = converter_input_channels
self.linear_dim = linear_dim
self.converter = Converter(
self.full_name(),
n_speakers,
speaker_dim,
converter_input_channels,
linear_dim,
convolutions=postnet_convolutions,
time_upsampling=time_upsampling,
dropout=dropout,
dtype=dtype)
def forward(self,
text_sequences,
valid_lengths,
mel_inputs,
speaker_indices=None,
text_positions=None,
frame_positions=None):
"""
Encode text sequence and decode with ground truth mel spectrogram.
Args:
text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. The
input text indices. T_enc means the timesteps of text_sequences.
valid_lengths (Variable): shape(batch_size,), dtype: int64,
valid lengths for each example in text_sequences.
mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
mel-spectrogram, which is used as decoder inputs when training.
speaker_indices (Variable, optional): Shape(Batch_size, 1),
dtype: int64. Speaker index for each example. This arg is not
None only when the model is a multispeaker model.
text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
Position indices of the text inputs for the encoder, where
T_enc means the encoder timesteps.
frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
int64. Position indices for each decoder time step.
Returns:
mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of mel spectrogram. Note that, when r > 1, the decoder
outputs r frames of mel spectrogram per step.
linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the output
linear spectrogram, where C_lin means the channel of linear
spectrogram and T_lin means the length (time steps) of linear
spectrogram. T_lin = time_upsampling * T_mel, which depends
on the time_upsampling factor of the converter.
alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, where N means number
of Attention Layers, T_mel means the length of mel spectrogram,
r means the outputs per decoder step, T_enc means the encoder
time steps.
done (Variable): Shape(B, 1, 1, T_mel // r), probability that the
outputs should stop.
"""
batch_size = text_sequences.shape[0]
if self.n_speakers == 1:
assert speaker_indices is None, "this model does not support multi-speaker"
if speaker_indices is not None:
speaker_embed = self.speaker_embedding(speaker_indices)
else:
speaker_embed = None
mel_outputs, alignments, done, decoder_states = self.seq2seq(
text_sequences, valid_lengths, mel_inputs, speaker_embed,
text_positions, frame_positions)
# unpack multi frames
if self.r > 1:
mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
mel_outputs = fluid.layers.reshape(
mel_outputs, [batch_size, -1, 1, self.mel_dim])
mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
if self.use_decoder_state_for_postnet_input:
postnet_input = fluid.layers.transpose(decoder_states, [0, 3, 2, 1])
postnet_input = fluid.layers.reshape(
postnet_input,
[batch_size, -1, 1, self.converter_input_channels])
postnet_input = fluid.layers.transpose(postnet_input, [0, 3, 2, 1])
else:
postnet_input = mel_outputs
linear_outputs = self.converter(postnet_input, speaker_embed)
return mel_outputs, linear_outputs, alignments, done
def transduce(self, text_sequences, text_positions, speaker_indices=None):
"""
Encode text sequence and decode without ground truth mel spectrogram.
Args:
text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. The
input text indices. T_enc means the timesteps of text_sequences.
text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
Position indices of the text inputs for the encoder, where
T_enc means the encoder timesteps.
speaker_indices (Variable, optional): Shape(Batch_size, 1),
dtype: int64. Speaker index for each example. This arg is not
None only when the model is a multispeaker model.
Returns:
mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of mel spectrogram. Note that, when r > 1, the decoder
outputs r frames of mel spectrogram per step.
linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the output
linear spectrogram, where C_lin means the channel of linear
spectrogram and T_lin means the length (time steps) of linear
spectrogram. T_lin = time_upsampling * T_mel, which depends
on the time_upsampling factor of the converter.
alignments (Variable): Shape(B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, T_mel means the
length of mel spectrogram, r means the outputs per decoder
step, T_enc means the encoder time steps.
done (Variable): Shape(B, 1, 1, T_mel // r), probability that the
outputs should stop.
"""
batch_size = text_sequences.shape[0]
if speaker_indices is not None:
speaker_embed = self.speaker_embedding(speaker_indices)
else:
speaker_embed = None
mel_outputs, alignments, done, decoder_states = self.seq2seq.transduce(
text_sequences, text_positions, speaker_embed)
if self.r > 1:
mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
mel_outputs = fluid.layers.reshape(
mel_outputs, [batch_size, -1, 1, self.mel_dim])
mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
if self.use_decoder_state_for_postnet_input:
postnet_input = fluid.layers.transpose(decoder_states, [0, 3, 2, 1])
postnet_input = fluid.layers.reshape(
postnet_input,
[batch_size, -1, 1, self.converter_input_channels])
postnet_input = fluid.layers.transpose(postnet_input, [0, 3, 2, 1])
else:
postnet_input = mel_outputs
linear_outputs = self.converter(postnet_input, speaker_embed)
return mel_outputs, linear_outputs, alignments, done
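# Inference sketch (hypothetical; transduce() requires batch size 1 because
# the decoder only supports single-instance decoding):
#   mel, linear, alignments, done = model.transduce(text, text_positions)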
class ConvS2S(dg.Layer):
def __init__(self, name_scope, n_speakers, speaker_dim,
speaker_embedding_weight_std, n_vocab, embed_dim,
text_padding_idx, text_embedding_weight_std,
freeze_text_embedding, encoder_convolutions, max_positions,
position_padding_idx, trainable_positional_encodings, mel_dim,
r, prenet_convolutions, attentive_convolutions, attention,
use_memory_mask, force_monotonic_attention,
query_position_rate, key_position_rate, window_range,
key_projection, value_projection, dropout, dtype):
super(ConvS2S, self).__init__(name_scope, dtype)
self.freeze_text_embedding = freeze_text_embedding
self.trainable_positional_encodings = trainable_positional_encodings
self.n_speakers = n_speakers
self.speaker_dim = speaker_dim
self.embed_dim = embed_dim
self.encoder = Encoder(
self.full_name(),
n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=text_embedding_weight_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
dropout=dropout,
dtype=dtype)
if freeze_text_embedding:
self.encoder.freeze_embedding()
self.mel_dim = mel_dim
self.r = r
self.decoder = Decoder(
self.full_name(),
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r,
max_positions,
position_padding_idx,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=window_range,
key_projection=key_projection,
value_projection=value_projection,
dtype=dtype)
if not trainable_positional_encodings:
self.decoder.freeze_positional_encoding()
def forward(self,
text_sequences,
valid_lengths,
mel_inputs,
speaker_embed=None,
text_positions=None,
frame_positions=None):
"""
Encode text sequence and decode with ground truth mel spectrogram.
Args:
text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. The
input text indices. T_enc means the timesteps of text_sequences.
valid_lengths (Variable): shape(batch_size,), dtype: int64,
valid lengths for each example in text_sequences.
mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
mel-spectrogram, which is used as decoder inputs when training.
speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
dtype: float32. Speaker embeddings. This arg is not None only
when the model is a multispeaker model.
text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
Position indices of the text inputs for the encoder, where
T_enc means the encoder timesteps.
frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
int64. Position indices for each decoder time step.
Returns:
mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of mel spectrogram. Note that, when r > 1, the decoder
outputs r frames of mel spectrogram per step.
alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, where N means number
of Attention Layers, T_mel means the length of mel spectrogram,
r means the outputs per decoder step, T_enc means the encoder
time steps.
done (Variable): Shape(B, 1, 1, T_mel // r), probability that the
outputs should stop.
decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder
hidden states, where C_dec means the channels of decoder states.
"""
keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder(
(keys, values), valid_lengths, mel_inputs, text_positions,
frame_positions, speaker_embed)
return mel_outputs, alignments, done, decoder_states
def transduce(self, text_sequences, text_positions, speaker_embed=None):
"""
Encode text sequence and decode without ground truth mel spectrogram.
Args:
text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. The
input text indices. T_enc means the timesteps of text_sequences.
text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
Position indices of the text inputs for the encoder, where
T_enc means the encoder timesteps.
speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
dtype: float32. Speaker embeddings. This arg is not None only
when the model is a multispeaker model.
Returns:
mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
outputs, where C_mel means the channels of mel-spectrogram, r
means the outputs per decoder step, T_mel means the length(time
steps) of mel spectrogram. Note that, when r > 1, the decoder
outputs r frames of mel spectrogram per step.
alignments (Variable): Shape(B, T_mel // r, T_enc), the alignment
tensor between the decoder and the encoder, T_mel means the
length of mel spectrogram, r means the outputs per decoder
step, T_enc means the encoder time steps.
done (Variable): Shape(B, 1, 1, T_mel // r), probability that the
outputs should stop.
decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder
hidden states, where C_dec means the channels of decoder states.
"""
keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder.decode(
(keys, values), text_positions, speaker_embed)
return mel_outputs, alignments, done, decoder_states
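# Training-time call sketch for ConvS2S (hypothetical shapes; speaker_embed is
# None for a single-speaker model):
#   mel_outputs, alignments, done, states = convs2s(
#       text, valid_lengths, mel, speaker_embed, text_positions, frame_positions)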