From 7b9e9c7a674b01c771a8cc33aec59f1b4b0d7089 Mon Sep 17 00:00:00 2001
From: chenfeiyu
Date: Thu, 31 Dec 2020 16:52:21 +0800
Subject: [PATCH] bug fix: apply dropout to logits before softmax

---
 parakeet/modules/attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parakeet/modules/attention.py b/parakeet/modules/attention.py
index aaf0b55..ea6b986 100644
--- a/parakeet/modules/attention.py
+++ b/parakeet/modules/attention.py
@@ -61,8 +61,8 @@ def scaled_dot_product_attention(q,
     if mask is not None:
         scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

+    scaled_logit = F.dropout(scaled_logit, dropout, training=training)
     attn_weights = F.softmax(scaled_logit, axis=-1)
-    attn_weights = F.dropout(attn_weights, dropout, training=training)
     out = paddle.matmul(attn_weights, v)
     return out, attn_weights

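For reference, below is a minimal, self-contained sketch of how the function plausibly reads after this patch, i.e. with dropout applied to the scaled logits before the softmax rather than to the attention weights after it. Only the lines visible in the hunk are taken from the patch; the imports, the q.k^T / sqrt(d_key) logit computation, the default argument values, and the usage example are assumptions for illustration, not the repository's exact code.

import math

import paddle
import paddle.nn.functional as F


def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, training=True):
    # assumed logit computation: dot products of queries and keys, scaled by sqrt(d_key)
    d_key = q.shape[-1]
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d_key))

    if mask is not None:
        # push masked positions toward -inf so softmax assigns them ~0 weight
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    # the change made by this patch: dropout on the logits, before softmax
    scaled_logit = F.dropout(scaled_logit, dropout, training=training)
    attn_weights = F.softmax(scaled_logit, axis=-1)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights


# usage sketch: batch of 2, 4 query/key positions, key dimension 8
q = paddle.randn([2, 4, 8])
k = paddle.randn([2, 4, 8])
v = paddle.randn([2, 4, 8])
out, attn = scaled_dot_product_attention(q, k, v, dropout=0.1, training=True)
print(out.shape, attn.shape)  # [2, 4, 8] [2, 4, 4]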