bug fix: apply dropout to logits before softmax
parent 737b09d03c
commit 7b9e9c7a67
@@ -61,8 +61,8 @@ def scaled_dot_product_attention(q,
     if mask is not None:
         scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here

+    scaled_logit = F.dropout(scaled_logit, dropout, training=training)
     attn_weights = F.softmax(scaled_logit, axis=-1)
-    attn_weights = F.dropout(attn_weights, dropout, training=training)
     out = paddle.matmul(attn_weights, v)

     return out, attn_weights
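For context, a minimal sketch of how the function reads after this commit: dropout now regularizes the scaled logits before softmax, and the former post-softmax dropout on the attention weights is removed. Only the lines in the hunk come from the repo; the signature, the scaling step, the tensor shapes, and the usage example below are reconstructed assumptions, not the project's actual code outside this diff.

import paddle
import paddle.nn.functional as F

def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, training=True):
    # q, k, v: [batch, n_heads, seq_len, d_k]; mask is 1 for positions to keep
    d_k = q.shape[-1]
    scaled_logit = paddle.matmul(q, k, transpose_y=True) / (d_k ** 0.5)

    if mask is not None:
        # push masked positions toward -inf so softmax gives them ~0 weight
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    # after this commit: dropout is applied to the logits, before softmax
    scaled_logit = F.dropout(scaled_logit, dropout, training=training)
    attn_weights = F.softmax(scaled_logit, axis=-1)
    out = paddle.matmul(attn_weights, v)

    return out, attn_weights

# hypothetical usage with random tensors
q = k = v = paddle.rand([2, 4, 16, 64])
out, attn = scaled_dot_product_attention(q, k, v, dropout=0.1, training=True)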