# deepke/conf/model/transformer.yaml
# Transformer encoder hyperparameters (Hydra/OmegaConf config).
model_name: transformer
# ??? is OmegaConf's mandatory-missing marker: hidden_size is taken from the
# embedding output at runtime and must not be set here.
hidden_size: ???
num_heads: 4  # must divide hidden_size evenly
num_hidden_layers: 3
intermediate_size: 256
dropout: 0.1
# Written as 1.0e-12, not 1e-12: YAML 1.1 resolvers (e.g. PyYAML) treat a
# dotless exponent form as a string; the dotted form is a float everywhere.
layer_norm_eps: 1.0e-12
hidden_act: gelu_new  # one of [relu, gelu, swish, gelu_new]
# Lowercase booleans — canonical in YAML 1.2 and required by yamllint "truthy".
output_attentions: true
output_hidden_states: true