---
# Hyperparameter configuration: audio feature extraction settings plus
# encoder/decoder, FastSpeech and TransformerTTS model/training settings.
#
# NOTE(review): this file had been collapsed onto a single line, which made
# everything after the first "#" a comment and left an invalid nested mapping.
# The layout below is reconstructed. The `audio` group is assumed to end at
# `outputs_per_step`, with all remaining keys at top level — confirm against
# the code that reads this config.

audio:
  num_mels: 80  # Number of mel bands when calculating mel spectrograms.
  n_fft: 2048  # Number of FFT components.
  sr: 22050  # Sampling rate of the audio data files.
  preemphasis: 0.97  # Pre-emphasis coefficient.
  hop_length: 256  # Number of samples to advance between frames.
  win_length: 1024  # Length (width) of the window function.
  power: 1.2  # Power to raise to before Griffin-Lim.
  min_level_db: -100  # Minimum level in dB.
  ref_level_db: 20  # Reference level in dB.
  outputs_per_step: 1  # Number of outputs per step.

encoder_n_layer: 6  # Number of FFT blocks in the encoder.
encoder_head: 2  # Number of attention heads in the encoder.
encoder_conv1d_filter_size: 1536  # Filter size of conv1d in the encoder.
max_seq_len: 2048  # Maximum sequence length.
decoder_n_layer: 6  # Number of FFT blocks in the decoder.
decoder_head: 2  # Number of attention heads in the decoder.
decoder_conv1d_filter_size: 1536  # Filter size of conv1d in the decoder.
fs_hidden_size: 384  # Hidden size of the FastSpeech model.
duration_predictor_output_size: 256  # Output size of the duration predictor.
duration_predictor_filter_size: 3  # Filter size of conv1d in the duration predictor.
fft_conv1d_filter: 3  # Filter size of conv1d in the FFT block.
fft_conv1d_padding: 1  # Padding size of conv1d in the FFT block.
dropout: 0.1  # Dropout used in the network.

transformer_head: 4  # Number of attention heads in TransformerTTS.
embedding_size: 512  # Embedding dimension in TransformerTTS.
hidden_size: 256  # Hidden size of the TransformerTTS model.

warm_up_step: 4000  # Warm-up steps for the learning rate.
grad_clip_thresh: 0.1  # Gradient clipping threshold.