Bug fix: apply dropout to the attention logits before the softmax
This commit is contained in:
parent
737b09d03c
commit
7b9e9c7a67
|
@ -61,8 +61,8 @@ def scaled_dot_product_attention(q,
|
|||
if mask is not None:
|
||||
scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here
|
||||
|
||||
scaled_logit = F.dropout(scaled_logit, dropout, training=training)
|
||||
attn_weights = F.softmax(scaled_logit, axis=-1)
|
||||
attn_weights = F.dropout(attn_weights, dropout, training=training)
|
||||
out = paddle.matmul(attn_weights, v)
|
||||
return out, attn_weights
|
||||
|
||||
|
|
Loading…
Reference in New Issue