update Conv1D and Linear
This commit is contained in:
parent
53f569a519
commit
f5ac04b1a3
|
@ -1,14 +1,14 @@
|
||||||
audio:
|
audio:
|
||||||
num_mels: 80
|
num_mels: 80 #the number of mel bands when calculating mel spectrograms.
|
||||||
n_fft: 2048
|
n_fft: 2048 #the number of fft components.
|
||||||
sr: 22050
|
sr: 22050 #the sampling rate of audio data file.
|
||||||
preemphasis: 0.97
|
preemphasis: 0.97 #the preemphasis coefficient.
|
||||||
hop_length: 256
|
hop_length: 256 #the number of samples to advance between frames.
|
||||||
win_length: 1024
|
win_length: 1024 #the length (width) of the window function.
|
||||||
power: 1.2
|
power: 1.2 #the power to raise before griffin-lim.
|
||||||
min_level_db: -100
|
min_level_db: -100 #the minimum level in dB.
|
||||||
ref_level_db: 20
|
ref_level_db: 20 #the reference level in dB.
|
||||||
outputs_per_step: 1
|
outputs_per_step: 1 #the outputs per step.
|
||||||
|
|
||||||
encoder_n_layer: 6
|
encoder_n_layer: 6
|
||||||
encoder_head: 2
|
encoder_head: 2
|
||||||
|
@ -35,12 +35,12 @@ epochs: 10000
|
||||||
lr: 0.001
|
lr: 0.001
|
||||||
save_step: 500
|
save_step: 500
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: False
|
use_data_parallel: True
|
||||||
|
|
||||||
data_path: ../../dataset/LJSpeech-1.1
|
data_path: ../../dataset/LJSpeech-1.1
|
||||||
transtts_path: ../TransformerTTS/checkpoint/
|
transtts_path: ../TransformerTTS/checkpoint/
|
||||||
transformer_step: 200000
|
transformer_step: 160000
|
||||||
save_path: ./checkpoint
|
save_path: ./checkpoint
|
||||||
log_dir: ./log
|
log_dir: ./log
|
||||||
#checkpoint_path: ./checkpoint
|
#checkpoint_path: ./checkpoint
|
||||||
#ransformer_step: 97000
|
#transformer_step: 97000
|
||||||
|
|
|
@ -51,7 +51,6 @@ def main(cfg):
|
||||||
with fluid.unique_name.guard():
|
with fluid.unique_name.guard():
|
||||||
transformerTTS = TransformerTTS(cfg)
|
transformerTTS = TransformerTTS(cfg)
|
||||||
model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer"))
|
model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer"))
|
||||||
|
|
||||||
transformerTTS.set_dict(model_dict)
|
transformerTTS.set_dict(model_dict)
|
||||||
transformerTTS.eval()
|
transformerTTS.eval()
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ lr: 0.001
|
||||||
save_step: 1000
|
save_step: 1000
|
||||||
image_step: 2000
|
image_step: 2000
|
||||||
use_gpu: True
|
use_gpu: True
|
||||||
use_data_parallel: True
|
use_data_parallel: False
|
||||||
stop_token: False
|
stop_token: False
|
||||||
|
|
||||||
data_path: ../../dataset/LJSpeech-1.1
|
data_path: ../../dataset/LJSpeech-1.1
|
||||||
|
|
|
@ -83,21 +83,21 @@ class DurationPredictor(dg.Layer):
|
||||||
self.dropout = dropout
|
self.dropout = dropout
|
||||||
|
|
||||||
k = math.sqrt(1 / self.input_size)
|
k = math.sqrt(1 / self.input_size)
|
||||||
self.conv1 = Conv1D(in_channels = self.input_size,
|
self.conv1 = Conv1D(num_channels = self.input_size,
|
||||||
out_channels = self.out_channels,
|
num_filters = self.out_channels,
|
||||||
filter_size = self.filter_size,
|
filter_size = self.filter_size,
|
||||||
padding=1,
|
padding=1,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||||
data_format='NTC')
|
#data_format='NTC')
|
||||||
k = math.sqrt(1 / self.out_channels)
|
k = math.sqrt(1 / self.out_channels)
|
||||||
self.conv2 = Conv1D(in_channels = self.out_channels,
|
self.conv2 = Conv1D(num_channels = self.out_channels,
|
||||||
out_channels = self.out_channels,
|
num_filters = self.out_channels,
|
||||||
filter_size = self.filter_size,
|
filter_size = self.filter_size,
|
||||||
padding=1,
|
padding=1,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||||
data_format='NTC')
|
#data_format='NTC')
|
||||||
self.layer_norm1 = dg.LayerNorm(self.out_channels)
|
self.layer_norm1 = dg.LayerNorm(self.out_channels)
|
||||||
self.layer_norm2 = dg.LayerNorm(self.out_channels)
|
self.layer_norm2 = dg.LayerNorm(self.out_channels)
|
||||||
|
|
||||||
|
@ -118,11 +118,18 @@ class DurationPredictor(dg.Layer):
|
||||||
out (Variable), Shape(B, T, C), the output of duration predictor.
|
out (Variable), Shape(B, T, C), the output of duration predictor.
|
||||||
"""
|
"""
|
||||||
# encoder_output.shape(N, T, C)
|
# encoder_output.shape(N, T, C)
|
||||||
out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout)
|
out = layers.transpose(encoder_output, [0,2,1])
|
||||||
out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout)
|
out = self.conv1(out)
|
||||||
|
out = layers.transpose(out, [0,2,1])
|
||||||
|
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
|
||||||
|
out = layers.transpose(out, [0,2,1])
|
||||||
|
out = self.conv2(out)
|
||||||
|
out = layers.transpose(out, [0,2,1])
|
||||||
|
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
|
||||||
out = layers.relu(self.linear(out))
|
out = layers.relu(self.linear(out))
|
||||||
out = layers.squeeze(out, axes=[-1])
|
out = layers.squeeze(out, axes=[-1])
|
||||||
|
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,22 +24,20 @@ class CBHG(dg.Layer):
|
||||||
self.projection_size = projection_size
|
self.projection_size = projection_size
|
||||||
self.conv_list = []
|
self.conv_list = []
|
||||||
k = math.sqrt(1 / projection_size)
|
k = math.sqrt(1 / projection_size)
|
||||||
self.conv_list.append(Conv1D(in_channels = projection_size,
|
self.conv_list.append(Conv1D(num_channels = projection_size,
|
||||||
out_channels = hidden_size,
|
num_filters = hidden_size,
|
||||||
filter_size = 1,
|
filter_size = 1,
|
||||||
padding = int(np.floor(1/2)),
|
padding = int(np.floor(1/2)),
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
|
||||||
data_format = "NCT"))
|
|
||||||
k = math.sqrt(1 / hidden_size)
|
k = math.sqrt(1 / hidden_size)
|
||||||
for i in range(2,K+1):
|
for i in range(2,K+1):
|
||||||
self.conv_list.append(Conv1D(in_channels = hidden_size,
|
self.conv_list.append(Conv1D(num_channels = hidden_size,
|
||||||
out_channels = hidden_size,
|
num_filters = hidden_size,
|
||||||
filter_size = i,
|
filter_size = i,
|
||||||
padding = int(np.floor(i/2)),
|
padding = int(np.floor(i/2)),
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
|
||||||
data_format = "NCT"))
|
|
||||||
|
|
||||||
for i, layer in enumerate(self.conv_list):
|
for i, layer in enumerate(self.conv_list):
|
||||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||||
|
@ -55,22 +53,20 @@ class CBHG(dg.Layer):
|
||||||
conv_outdim = hidden_size * K
|
conv_outdim = hidden_size * K
|
||||||
|
|
||||||
k = math.sqrt(1 / conv_outdim)
|
k = math.sqrt(1 / conv_outdim)
|
||||||
self.conv_projection_1 = Conv1D(in_channels = conv_outdim,
|
self.conv_projection_1 = Conv1D(num_channels = conv_outdim,
|
||||||
out_channels = hidden_size,
|
num_filters = hidden_size,
|
||||||
filter_size = 3,
|
filter_size = 3,
|
||||||
padding = int(np.floor(3/2)),
|
padding = int(np.floor(3/2)),
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||||
data_format = "NCT")
|
|
||||||
|
|
||||||
k = math.sqrt(1 / hidden_size)
|
k = math.sqrt(1 / hidden_size)
|
||||||
self.conv_projection_2 = Conv1D(in_channels = hidden_size,
|
self.conv_projection_2 = Conv1D(num_channels = hidden_size,
|
||||||
out_channels = projection_size,
|
num_filters = projection_size,
|
||||||
filter_size = 3,
|
filter_size = 3,
|
||||||
padding = int(np.floor(3/2)),
|
padding = int(np.floor(3/2)),
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
|
||||||
data_format = "NCT")
|
|
||||||
|
|
||||||
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
|
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
|
||||||
data_layout='NCHW')
|
data_layout='NCHW')
|
||||||
|
|
|
@ -17,24 +17,22 @@ class EncoderPrenet(dg.Layer):
|
||||||
padding_idx = None)
|
padding_idx = None)
|
||||||
self.conv_list = []
|
self.conv_list = []
|
||||||
k = math.sqrt(1 / embedding_size)
|
k = math.sqrt(1 / embedding_size)
|
||||||
self.conv_list.append(Conv1D(in_channels = embedding_size,
|
self.conv_list.append(Conv1D(num_channels = embedding_size,
|
||||||
out_channels = num_hidden,
|
num_filters = num_hidden,
|
||||||
filter_size = 5,
|
filter_size = 5,
|
||||||
padding = int(np.floor(5/2)),
|
padding = int(np.floor(5/2)),
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn))
|
||||||
data_format = "NCT"))
|
|
||||||
k = math.sqrt(1 / num_hidden)
|
k = math.sqrt(1 / num_hidden)
|
||||||
for _ in range(2):
|
for _ in range(2):
|
||||||
self.conv_list.append(Conv1D(in_channels = num_hidden,
|
self.conv_list.append(Conv1D(num_channels = num_hidden,
|
||||||
out_channels = num_hidden,
|
num_filters = num_hidden,
|
||||||
filter_size = 5,
|
filter_size = 5,
|
||||||
padding = int(np.floor(5/2)),
|
padding = int(np.floor(5/2)),
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn))
|
||||||
data_format = "NCT"))
|
|
||||||
|
|
||||||
for i, layer in enumerate(self.conv_list):
|
for i, layer in enumerate(self.conv_list):
|
||||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||||
|
|
|
@ -22,34 +22,31 @@ class PostConvNet(dg.Layer):
|
||||||
self.batchnorm_last = batchnorm_last
|
self.batchnorm_last = batchnorm_last
|
||||||
self.conv_list = []
|
self.conv_list = []
|
||||||
k = math.sqrt(1 / (n_mels * outputs_per_step))
|
k = math.sqrt(1 / (n_mels * outputs_per_step))
|
||||||
self.conv_list.append(Conv1D(in_channels = n_mels * outputs_per_step,
|
self.conv_list.append(Conv1D(num_channels = n_mels * outputs_per_step,
|
||||||
out_channels = num_hidden,
|
num_filters = num_hidden,
|
||||||
filter_size = filter_size,
|
filter_size = filter_size,
|
||||||
padding = padding,
|
padding = padding,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn))
|
||||||
data_format = "NCT"))
|
|
||||||
|
|
||||||
k = math.sqrt(1 / num_hidden)
|
k = math.sqrt(1 / num_hidden)
|
||||||
for _ in range(1, num_conv-1):
|
for _ in range(1, num_conv-1):
|
||||||
self.conv_list.append(Conv1D(in_channels = num_hidden,
|
self.conv_list.append(Conv1D(num_channels = num_hidden,
|
||||||
out_channels = num_hidden,
|
num_filters = num_hidden,
|
||||||
filter_size = filter_size,
|
filter_size = filter_size,
|
||||||
padding = padding,
|
padding = padding,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn))
|
||||||
data_format = "NCT") )
|
|
||||||
|
|
||||||
self.conv_list.append(Conv1D(in_channels = num_hidden,
|
self.conv_list.append(Conv1D(num_channels = num_hidden,
|
||||||
out_channels = n_mels * outputs_per_step,
|
num_filters = n_mels * outputs_per_step,
|
||||||
filter_size = filter_size,
|
filter_size = filter_size,
|
||||||
padding = padding,
|
padding = padding,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn))
|
||||||
data_format = "NCT"))
|
|
||||||
|
|
||||||
for i, layer in enumerate(self.conv_list):
|
for i, layer in enumerate(self.conv_list):
|
||||||
self.add_sublayer("conv_list_{}".format(i), layer)
|
self.add_sublayer("conv_list_{}".format(i), layer)
|
||||||
|
|
|
@ -10,15 +10,13 @@ class Vocoder(dg.Layer):
|
||||||
"""
|
"""
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(Vocoder, self).__init__()
|
super(Vocoder, self).__init__()
|
||||||
self.pre_proj = Conv1D(in_channels = config.audio.num_mels,
|
self.pre_proj = Conv1D(num_channels = config.audio.num_mels,
|
||||||
out_channels = config.hidden_size,
|
num_filters = config.hidden_size,
|
||||||
filter_size=1,
|
filter_size=1)
|
||||||
data_format = "NCT")
|
|
||||||
self.cbhg = CBHG(config.hidden_size, config.batch_size)
|
self.cbhg = CBHG(config.hidden_size, config.batch_size)
|
||||||
self.post_proj = Conv1D(in_channels = config.hidden_size,
|
self.post_proj = Conv1D(num_channels = config.hidden_size,
|
||||||
out_channels = (config.audio.n_fft // 2) + 1,
|
num_filters = (config.audio.n_fft // 2) + 1,
|
||||||
filter_size=1,
|
filter_size=1)
|
||||||
data_format = "NCT")
|
|
||||||
|
|
||||||
def forward(self, mel):
|
def forward(self, mel):
|
||||||
mel = layers.transpose(mel, [0,2,1])
|
mel = layers.transpose(mel, [0,2,1])
|
||||||
|
|
|
@ -14,23 +14,21 @@ class PositionwiseFeedForward(dg.Layer):
|
||||||
self.dropout = dropout
|
self.dropout = dropout
|
||||||
|
|
||||||
k = math.sqrt(1 / d_in)
|
k = math.sqrt(1 / d_in)
|
||||||
self.w_1 = Conv1D(in_channels = d_in,
|
self.w_1 = Conv1D(num_channels = d_in,
|
||||||
out_channels = num_hidden,
|
num_filters = num_hidden,
|
||||||
filter_size = filter_size,
|
filter_size = filter_size,
|
||||||
padding=padding,
|
padding=padding,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn)
|
||||||
data_format = "NTC")
|
|
||||||
k = math.sqrt(1 / num_hidden)
|
k = math.sqrt(1 / num_hidden)
|
||||||
self.w_2 = Conv1D(in_channels = num_hidden,
|
self.w_2 = Conv1D(num_channels = num_hidden,
|
||||||
out_channels = d_in,
|
num_filters = d_in,
|
||||||
filter_size = filter_size,
|
filter_size = filter_size,
|
||||||
padding=padding,
|
padding=padding,
|
||||||
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
|
||||||
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
|
||||||
use_cudnn = use_cudnn,
|
use_cudnn = use_cudnn)
|
||||||
data_format = "NTC")
|
|
||||||
self.layer_norm = dg.LayerNorm(d_in)
|
self.layer_norm = dg.LayerNorm(d_in)
|
||||||
|
|
||||||
def forward(self, input):
|
def forward(self, input):
|
||||||
|
@ -42,12 +40,14 @@ class PositionwiseFeedForward(dg.Layer):
|
||||||
Returns:
|
Returns:
|
||||||
output (Variable), Shape(B, T, C), the result after FFN.
|
output (Variable), Shape(B, T, C), the result after FFN.
|
||||||
"""
|
"""
|
||||||
|
x = layers.transpose(input, [0,2,1])
|
||||||
#FFN Network
|
#FFN Network
|
||||||
x = self.w_2(layers.relu(self.w_1(input)))
|
x = self.w_2(layers.relu(self.w_1(x)))
|
||||||
|
|
||||||
# dropout
|
# dropout
|
||||||
x = layers.dropout(x, self.dropout)
|
x = layers.dropout(x, self.dropout)
|
||||||
|
|
||||||
|
x = layers.transpose(x, [0,2,1])
|
||||||
# residual connection
|
# residual connection
|
||||||
x = x + input
|
x = x + input
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue