# YAML configuration file
# Flat key/value hyperparameter settings, grouped by the section comments
# below. The program that reads this file is not visible here, so notes
# marked "presumably" are reviewer assumptions to confirm against it.

# data processing
p_pronunciation: 0.99
sample_rate: 22050 # Hz
# presumably STFT settings expressed in samples - TODO confirm
n_fft: 1024
win_length: 1024    # same value as n_fft
hop_length: 256
n_mels: 80
reduction_factor: 4

# model-s2s
n_speakers: 1
speaker_dim: 16
char_dim: 256
encoder_dim: 64
kernel_size: 5
encoder_layers: 7
decoder_layers: 8
prenet_sizes: [128]    # a list; currently holds a single entry
attention_dim: 128

# model-postnet
postnet_layers: 5
postnet_dim: 256

# position embedding
position_weight: 1.0
position_rate: 5.54
forward_step: 4
backward_step: 0

# NOTE(review): this key has no section header; which layers it applies to
# cannot be told from this file.
dropout: 0.05

# output-griffinlim
sharpening_factor: 1.4

# optimizer
learning_rate: 0.001
# presumably gradient-clipping thresholds - confirm against the training code
clip_value: 5.0
clip_norm: 100.0

# training
max_iteration: 1000000
batch_size: 16
# both intervals are set to the same value; units (presumably iterations)
# are not stated here - TODO confirm
report_interval: 10000
save_interval: 10000
valid_size: 5