Merge pull request #4316 from Topdu/release/2.3
Cherry-pick the fix for NRTR inference model export from dygraph to release/2.3
This commit is contained in:
commit
30d908b6ca
|
@ -46,7 +46,7 @@ Architecture:
|
|||
name: Transformer
|
||||
d_model: 512
|
||||
num_encoder_layers: 6
|
||||
beam_size: 10 # When Beam size is greater than 0, it means to use beam search when evaluation.
|
||||
beam_size: -1 # When Beam size is greater than 0, it means to use beam search when evaluation.
|
||||
|
||||
|
||||
Loss:
|
||||
|
@ -65,7 +65,7 @@ Train:
|
|||
name: LMDBDataSet
|
||||
data_dir: ./train_data/data_lmdb_release/training/
|
||||
transforms:
|
||||
- NRTRDecodeImage: # load image
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- NRTRLabelEncode: # Class handling label
|
||||
|
@ -85,7 +85,7 @@ Eval:
|
|||
name: LMDBDataSet
|
||||
data_dir: ./train_data/data_lmdb_release/evaluation/
|
||||
transforms:
|
||||
- NRTRDecodeImage: # load image
|
||||
- DecodeImage: # load image
|
||||
img_mode: BGR
|
||||
channel_first: False
|
||||
- NRTRLabelEncode: # Class handling label
|
||||
|
|
|
@ -174,21 +174,26 @@ class NRTRLabelEncode(BaseRecLabelEncode):
|
|||
super(NRTRLabelEncode,
|
||||
self).__init__(max_text_length, character_dict_path,
|
||||
character_type, use_space_char)
|
||||
|
||||
def __call__(self, data):
|
||||
text = data['label']
|
||||
text = self.encode(text)
|
||||
if text is None:
|
||||
return None
|
||||
if len(text) >= self.max_text_len - 1:
|
||||
return None
|
||||
data['length'] = np.array(len(text))
|
||||
text.insert(0, 2)
|
||||
text.append(3)
|
||||
text = text + [0] * (self.max_text_len - len(text))
|
||||
data['label'] = np.array(text)
|
||||
return data
|
||||
|
||||
def add_special_char(self, dict_character):
|
||||
dict_character = ['blank','<unk>','<s>','</s>'] + dict_character
|
||||
dict_character = ['blank', '<unk>', '<s>', '</s>'] + dict_character
|
||||
return dict_character
|
||||
|
||||
|
||||
class CTCLabelEncode(BaseRecLabelEncode):
|
||||
""" Convert between text-label and text-index """
|
||||
|
||||
|
|
|
@ -44,12 +44,33 @@ class ClsResizeImg(object):
|
|||
|
||||
|
||||
class NRTRRecResizeImg(object):
|
||||
def __init__(self, image_shape, resize_type, **kwargs):
|
||||
def __init__(self, image_shape, resize_type, padding=False, **kwargs):
|
||||
self.image_shape = image_shape
|
||||
self.resize_type = resize_type
|
||||
self.padding = padding
|
||||
|
||||
def __call__(self, data):
|
||||
img = data['image']
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||
image_shape = self.image_shape
|
||||
if self.padding:
|
||||
imgC, imgH, imgW = image_shape
|
||||
# todo: change to 0 and modified image shape
|
||||
h = img.shape[0]
|
||||
w = img.shape[1]
|
||||
ratio = w / float(h)
|
||||
if math.ceil(imgH * ratio) > imgW:
|
||||
resized_w = imgW
|
||||
else:
|
||||
resized_w = int(math.ceil(imgH * ratio))
|
||||
resized_image = cv2.resize(img, (resized_w, imgH))
|
||||
norm_img = np.expand_dims(resized_image, -1)
|
||||
norm_img = norm_img.transpose((2, 0, 1))
|
||||
resized_image = norm_img.astype(np.float32) / 128. - 1.
|
||||
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
|
||||
padding_im[:, :, 0:resized_w] = resized_image
|
||||
data['image'] = padding_im
|
||||
return data
|
||||
if self.resize_type == 'PIL':
|
||||
image_pil = Image.fromarray(np.uint8(img))
|
||||
img = image_pil.resize(self.image_shape, Image.ANTIALIAS)
|
||||
|
|
|
@ -15,7 +15,6 @@ import numpy as np
|
|||
import os
|
||||
import random
|
||||
from paddle.io import Dataset
|
||||
|
||||
from .imaug import transform, create_operators
|
||||
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
# limitations under the License.
|
||||
|
||||
from paddle import nn
|
||||
import paddle
|
||||
|
||||
|
||||
class MTB(nn.Layer):
|
||||
|
@ -40,7 +41,8 @@ class MTB(nn.Layer):
|
|||
x = self.block(images)
|
||||
if self.cnn_num == 2:
|
||||
# (b, w, h, c)
|
||||
x = x.transpose([0, 3, 2, 1])
|
||||
x_shape = x.shape
|
||||
x = x.reshape([x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
|
||||
x = paddle.transpose(x, [0, 3, 2, 1])
|
||||
x_shape = paddle.shape(x)
|
||||
x = paddle.reshape(
|
||||
x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
|
||||
return x
|
||||
|
|
|
@ -71,8 +71,6 @@ class MultiheadAttention(nn.Layer):
|
|||
value,
|
||||
key_padding_mask=None,
|
||||
incremental_state=None,
|
||||
need_weights=True,
|
||||
static_kv=False,
|
||||
attn_mask=None):
|
||||
"""
|
||||
Inputs of forward function
|
||||
|
@ -88,46 +86,42 @@ class MultiheadAttention(nn.Layer):
|
|||
attn_output: [target length, batch size, embed dim]
|
||||
attn_output_weights: [batch size, target length, sequence length]
|
||||
"""
|
||||
tgt_len, bsz, embed_dim = query.shape
|
||||
assert embed_dim == self.embed_dim
|
||||
assert list(query.shape) == [tgt_len, bsz, embed_dim]
|
||||
assert key.shape == value.shape
|
||||
|
||||
q_shape = paddle.shape(query)
|
||||
src_shape = paddle.shape(key)
|
||||
q = self._in_proj_q(query)
|
||||
k = self._in_proj_k(key)
|
||||
v = self._in_proj_v(value)
|
||||
q *= self.scaling
|
||||
|
||||
q = q.reshape([tgt_len, bsz * self.num_heads, self.head_dim]).transpose(
|
||||
[1, 0, 2])
|
||||
k = k.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose(
|
||||
[1, 0, 2])
|
||||
v = v.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose(
|
||||
[1, 0, 2])
|
||||
|
||||
src_len = k.shape[1]
|
||||
|
||||
q = paddle.transpose(
|
||||
paddle.reshape(
|
||||
q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
|
||||
[1, 2, 0, 3])
|
||||
k = paddle.transpose(
|
||||
paddle.reshape(
|
||||
k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
|
||||
[1, 2, 0, 3])
|
||||
v = paddle.transpose(
|
||||
paddle.reshape(
|
||||
v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
|
||||
[1, 2, 0, 3])
|
||||
if key_padding_mask is not None:
|
||||
assert key_padding_mask.shape[0] == bsz
|
||||
assert key_padding_mask.shape[1] == src_len
|
||||
|
||||
attn_output_weights = paddle.bmm(q, k.transpose([0, 2, 1]))
|
||||
assert list(attn_output_weights.
|
||||
shape) == [bsz * self.num_heads, tgt_len, src_len]
|
||||
|
||||
assert key_padding_mask.shape[0] == q_shape[1]
|
||||
assert key_padding_mask.shape[1] == src_shape[0]
|
||||
attn_output_weights = paddle.matmul(q,
|
||||
paddle.transpose(k, [0, 1, 3, 2]))
|
||||
if attn_mask is not None:
|
||||
attn_mask = attn_mask.unsqueeze(0)
|
||||
attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
|
||||
attn_output_weights += attn_mask
|
||||
if key_padding_mask is not None:
|
||||
attn_output_weights = attn_output_weights.reshape(
|
||||
[bsz, self.num_heads, tgt_len, src_len])
|
||||
key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32')
|
||||
y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf')
|
||||
attn_output_weights = paddle.reshape(
|
||||
attn_output_weights,
|
||||
[q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
|
||||
key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
|
||||
key = paddle.cast(key, 'float32')
|
||||
y = paddle.full(
|
||||
shape=paddle.shape(key), dtype='float32', fill_value='-inf')
|
||||
y = paddle.where(key == 0., key, y)
|
||||
attn_output_weights += y
|
||||
attn_output_weights = attn_output_weights.reshape(
|
||||
[bsz * self.num_heads, tgt_len, src_len])
|
||||
|
||||
attn_output_weights = F.softmax(
|
||||
attn_output_weights.astype('float32'),
|
||||
axis=-1,
|
||||
|
@ -136,43 +130,34 @@ class MultiheadAttention(nn.Layer):
|
|||
attn_output_weights = F.dropout(
|
||||
attn_output_weights, p=self.dropout, training=self.training)
|
||||
|
||||
attn_output = paddle.bmm(attn_output_weights, v)
|
||||
assert list(attn_output.
|
||||
shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
|
||||
attn_output = attn_output.transpose([1, 0, 2]).reshape(
|
||||
[tgt_len, bsz, embed_dim])
|
||||
attn_output = paddle.matmul(attn_output_weights, v)
|
||||
attn_output = paddle.reshape(
|
||||
paddle.transpose(attn_output, [2, 0, 1, 3]),
|
||||
[q_shape[0], q_shape[1], self.embed_dim])
|
||||
attn_output = self.out_proj(attn_output)
|
||||
|
||||
if need_weights:
|
||||
# average attention weights over heads
|
||||
attn_output_weights = attn_output_weights.reshape(
|
||||
[bsz, self.num_heads, tgt_len, src_len])
|
||||
attn_output_weights = attn_output_weights.sum(
|
||||
axis=1) / self.num_heads
|
||||
else:
|
||||
attn_output_weights = None
|
||||
return attn_output, attn_output_weights
|
||||
return attn_output
|
||||
|
||||
def _in_proj_q(self, query):
|
||||
query = query.transpose([1, 2, 0])
|
||||
query = paddle.transpose(query, [1, 2, 0])
|
||||
query = paddle.unsqueeze(query, axis=2)
|
||||
res = self.conv1(query)
|
||||
res = paddle.squeeze(res, axis=2)
|
||||
res = res.transpose([2, 0, 1])
|
||||
res = paddle.transpose(res, [2, 0, 1])
|
||||
return res
|
||||
|
||||
def _in_proj_k(self, key):
|
||||
key = key.transpose([1, 2, 0])
|
||||
key = paddle.transpose(key, [1, 2, 0])
|
||||
key = paddle.unsqueeze(key, axis=2)
|
||||
res = self.conv2(key)
|
||||
res = paddle.squeeze(res, axis=2)
|
||||
res = res.transpose([2, 0, 1])
|
||||
res = paddle.transpose(res, [2, 0, 1])
|
||||
return res
|
||||
|
||||
def _in_proj_v(self, value):
|
||||
value = value.transpose([1, 2, 0]) #(1, 2, 0)
|
||||
value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0)
|
||||
value = paddle.unsqueeze(value, axis=2)
|
||||
res = self.conv3(value)
|
||||
res = paddle.squeeze(res, axis=2)
|
||||
res = res.transpose([2, 0, 1])
|
||||
res = paddle.transpose(res, [2, 0, 1])
|
||||
return res
|
||||
|
|
|
@ -61,12 +61,12 @@ class Transformer(nn.Layer):
|
|||
custom_decoder=None,
|
||||
in_channels=0,
|
||||
out_channels=0,
|
||||
dst_vocab_size=99,
|
||||
scale_embedding=True):
|
||||
super(Transformer, self).__init__()
|
||||
self.out_channels = out_channels + 1
|
||||
self.embedding = Embeddings(
|
||||
d_model=d_model,
|
||||
vocab=dst_vocab_size,
|
||||
vocab=self.out_channels,
|
||||
padding_idx=0,
|
||||
scale_embedding=scale_embedding)
|
||||
self.positional_encoding = PositionalEncoding(
|
||||
|
@ -96,9 +96,10 @@ class Transformer(nn.Layer):
|
|||
self.beam_size = beam_size
|
||||
self.d_model = d_model
|
||||
self.nhead = nhead
|
||||
self.tgt_word_prj = nn.Linear(d_model, dst_vocab_size, bias_attr=False)
|
||||
self.tgt_word_prj = nn.Linear(
|
||||
d_model, self.out_channels, bias_attr=False)
|
||||
w0 = np.random.normal(0.0, d_model**-0.5,
|
||||
(d_model, dst_vocab_size)).astype(np.float32)
|
||||
(d_model, self.out_channels)).astype(np.float32)
|
||||
self.tgt_word_prj.weight.set_value(w0)
|
||||
self.apply(self._init_weights)
|
||||
|
||||
|
@ -156,46 +157,41 @@ class Transformer(nn.Layer):
|
|||
return self.forward_test(src)
|
||||
|
||||
def forward_test(self, src):
|
||||
bs = src.shape[0]
|
||||
bs = paddle.shape(src)[0]
|
||||
if self.encoder is not None:
|
||||
src = self.positional_encoding(src.transpose([1, 0, 2]))
|
||||
src = self.positional_encoding(paddle.transpose(src, [1, 0, 2]))
|
||||
memory = self.encoder(src)
|
||||
else:
|
||||
memory = src.squeeze(2).transpose([2, 0, 1])
|
||||
memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1])
|
||||
dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
|
||||
dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32)
|
||||
for len_dec_seq in range(1, 25):
|
||||
src_enc = memory.clone()
|
||||
tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
|
||||
dec_seq_embed = self.embedding(dec_seq).transpose([1, 0, 2])
|
||||
dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
|
||||
dec_seq_embed = self.positional_encoding(dec_seq_embed)
|
||||
tgt_mask = self.generate_square_subsequent_mask(dec_seq_embed.shape[
|
||||
0])
|
||||
tgt_mask = self.generate_square_subsequent_mask(
|
||||
paddle.shape(dec_seq_embed)[0])
|
||||
output = self.decoder(
|
||||
dec_seq_embed,
|
||||
src_enc,
|
||||
memory,
|
||||
tgt_mask=tgt_mask,
|
||||
memory_mask=None,
|
||||
tgt_key_padding_mask=tgt_key_padding_mask,
|
||||
tgt_key_padding_mask=None,
|
||||
memory_key_padding_mask=None)
|
||||
dec_output = output.transpose([1, 0, 2])
|
||||
|
||||
dec_output = dec_output[:,
|
||||
-1, :] # Pick the last step: (bh * bm) * d_h
|
||||
word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
|
||||
word_prob = word_prob.reshape([1, bs, -1])
|
||||
preds_idx = word_prob.argmax(axis=2)
|
||||
|
||||
dec_output = paddle.transpose(output, [1, 0, 2])
|
||||
dec_output = dec_output[:, -1, :]
|
||||
word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
|
||||
preds_idx = paddle.argmax(word_prob, axis=1)
|
||||
if paddle.equal_all(
|
||||
preds_idx[-1],
|
||||
preds_idx,
|
||||
paddle.full(
|
||||
preds_idx[-1].shape, 3, dtype='int64')):
|
||||
paddle.shape(preds_idx), 3, dtype='int64')):
|
||||
break
|
||||
|
||||
preds_prob = word_prob.max(axis=2)
|
||||
preds_prob = paddle.max(word_prob, axis=1)
|
||||
dec_seq = paddle.concat(
|
||||
[dec_seq, preds_idx.reshape([-1, 1])], axis=1)
|
||||
|
||||
return dec_seq
|
||||
[dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1)
|
||||
dec_prob = paddle.concat(
|
||||
[dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1)
|
||||
return [dec_seq, dec_prob]
|
||||
|
||||
def forward_beam(self, images):
|
||||
''' Translation work in one batch '''
|
||||
|
@ -211,14 +207,15 @@ class Transformer(nn.Layer):
|
|||
n_prev_active_inst, n_bm):
|
||||
''' Collect tensor parts associated to active instances. '''
|
||||
|
||||
_, *d_hs = beamed_tensor.shape
|
||||
beamed_tensor_shape = paddle.shape(beamed_tensor)
|
||||
n_curr_active_inst = len(curr_active_inst_idx)
|
||||
new_shape = (n_curr_active_inst * n_bm, *d_hs)
|
||||
new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1],
|
||||
beamed_tensor_shape[2])
|
||||
|
||||
beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1])
|
||||
beamed_tensor = beamed_tensor.index_select(
|
||||
paddle.to_tensor(curr_active_inst_idx), axis=0)
|
||||
beamed_tensor = beamed_tensor.reshape([*new_shape])
|
||||
curr_active_inst_idx, axis=0)
|
||||
beamed_tensor = beamed_tensor.reshape(new_shape)
|
||||
|
||||
return beamed_tensor
|
||||
|
||||
|
@ -249,44 +246,26 @@ class Transformer(nn.Layer):
|
|||
b.get_current_state() for b in inst_dec_beams if not b.done
|
||||
]
|
||||
dec_partial_seq = paddle.stack(dec_partial_seq)
|
||||
|
||||
dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq])
|
||||
return dec_partial_seq
|
||||
|
||||
def prepare_beam_memory_key_padding_mask(
|
||||
inst_dec_beams, memory_key_padding_mask, n_bm):
|
||||
keep = []
|
||||
for idx in (memory_key_padding_mask):
|
||||
if not inst_dec_beams[idx].done:
|
||||
keep.append(idx)
|
||||
memory_key_padding_mask = memory_key_padding_mask[
|
||||
paddle.to_tensor(keep)]
|
||||
len_s = memory_key_padding_mask.shape[-1]
|
||||
n_inst = memory_key_padding_mask.shape[0]
|
||||
memory_key_padding_mask = paddle.concat(
|
||||
[memory_key_padding_mask for i in range(n_bm)], axis=1)
|
||||
memory_key_padding_mask = memory_key_padding_mask.reshape(
|
||||
[n_inst * n_bm, len_s]) #repeat(1, n_bm)
|
||||
return memory_key_padding_mask
|
||||
|
||||
def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
|
||||
memory_key_padding_mask):
|
||||
tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
|
||||
dec_seq = self.embedding(dec_seq).transpose([1, 0, 2])
|
||||
dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
|
||||
dec_seq = self.positional_encoding(dec_seq)
|
||||
tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[
|
||||
0])
|
||||
tgt_mask = self.generate_square_subsequent_mask(
|
||||
paddle.shape(dec_seq)[0])
|
||||
dec_output = self.decoder(
|
||||
dec_seq,
|
||||
enc_output,
|
||||
tgt_mask=tgt_mask,
|
||||
tgt_key_padding_mask=tgt_key_padding_mask,
|
||||
memory_key_padding_mask=memory_key_padding_mask,
|
||||
).transpose([1, 0, 2])
|
||||
tgt_key_padding_mask=None,
|
||||
memory_key_padding_mask=memory_key_padding_mask, )
|
||||
dec_output = paddle.transpose(dec_output, [1, 0, 2])
|
||||
dec_output = dec_output[:,
|
||||
-1, :] # Pick the last step: (bh * bm) * d_h
|
||||
word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
|
||||
word_prob = word_prob.reshape([n_active_inst, n_bm, -1])
|
||||
word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
|
||||
word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1])
|
||||
return word_prob
|
||||
|
||||
def collect_active_inst_idx_list(inst_beams, word_prob,
|
||||
|
@ -302,9 +281,8 @@ class Transformer(nn.Layer):
|
|||
|
||||
n_active_inst = len(inst_idx_to_position_map)
|
||||
dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
|
||||
memory_key_padding_mask = None
|
||||
word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm,
|
||||
memory_key_padding_mask)
|
||||
None)
|
||||
# Update the beam with predicted word prob information and collect incomplete instances
|
||||
active_inst_idx_list = collect_active_inst_idx_list(
|
||||
inst_dec_beams, word_prob, inst_idx_to_position_map)
|
||||
|
@ -324,27 +302,21 @@ class Transformer(nn.Layer):
|
|||
|
||||
with paddle.no_grad():
|
||||
#-- Encode
|
||||
|
||||
if self.encoder is not None:
|
||||
src = self.positional_encoding(images.transpose([1, 0, 2]))
|
||||
src_enc = self.encoder(src).transpose([1, 0, 2])
|
||||
src_enc = self.encoder(src)
|
||||
else:
|
||||
src_enc = images.squeeze(2).transpose([0, 2, 1])
|
||||
|
||||
#-- Repeat data for beam search
|
||||
n_bm = self.beam_size
|
||||
n_inst, len_s, d_h = src_enc.shape
|
||||
src_enc = paddle.concat([src_enc for i in range(n_bm)], axis=1)
|
||||
src_enc = src_enc.reshape([n_inst * n_bm, len_s, d_h]).transpose(
|
||||
[1, 0, 2])
|
||||
#-- Prepare beams
|
||||
inst_dec_beams = [Beam(n_bm) for _ in range(n_inst)]
|
||||
|
||||
#-- Bookkeeping for active or not
|
||||
active_inst_idx_list = list(range(n_inst))
|
||||
src_shape = paddle.shape(src_enc)
|
||||
inst_dec_beams = [Beam(n_bm) for _ in range(1)]
|
||||
active_inst_idx_list = list(range(1))
|
||||
# Repeat data for beam search
|
||||
src_enc = paddle.tile(src_enc, [1, n_bm, 1])
|
||||
inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
|
||||
active_inst_idx_list)
|
||||
#-- Decode
|
||||
# Decode
|
||||
for len_dec_seq in range(1, 25):
|
||||
src_enc_copy = src_enc.clone()
|
||||
active_inst_idx_list = beam_decode_step(
|
||||
|
@ -358,10 +330,19 @@ class Transformer(nn.Layer):
|
|||
batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams,
|
||||
1)
|
||||
result_hyp = []
|
||||
for bs_hyp in batch_hyp:
|
||||
bs_hyp_pad = bs_hyp[0] + [3] * (25 - len(bs_hyp[0]))
|
||||
hyp_scores = []
|
||||
for bs_hyp, score in zip(batch_hyp, batch_scores):
|
||||
l = len(bs_hyp[0])
|
||||
bs_hyp_pad = bs_hyp[0] + [3] * (25 - l)
|
||||
result_hyp.append(bs_hyp_pad)
|
||||
return paddle.to_tensor(np.array(result_hyp), dtype=paddle.int64)
|
||||
score = float(score) / l
|
||||
hyp_score = [score for _ in range(25)]
|
||||
hyp_scores.append(hyp_score)
|
||||
return [
|
||||
paddle.to_tensor(
|
||||
np.array(result_hyp), dtype=paddle.int64),
|
||||
paddle.to_tensor(hyp_scores)
|
||||
]
|
||||
|
||||
def generate_square_subsequent_mask(self, sz):
|
||||
"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
|
||||
|
@ -376,7 +357,7 @@ class Transformer(nn.Layer):
|
|||
return mask
|
||||
|
||||
def generate_padding_mask(self, x):
|
||||
padding_mask = x.equal(paddle.to_tensor(0, dtype=x.dtype))
|
||||
padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype))
|
||||
return padding_mask
|
||||
|
||||
def _reset_parameters(self):
|
||||
|
@ -514,17 +495,17 @@ class TransformerEncoderLayer(nn.Layer):
|
|||
src,
|
||||
src,
|
||||
attn_mask=src_mask,
|
||||
key_padding_mask=src_key_padding_mask)[0]
|
||||
key_padding_mask=src_key_padding_mask)
|
||||
src = src + self.dropout1(src2)
|
||||
src = self.norm1(src)
|
||||
|
||||
src = src.transpose([1, 2, 0])
|
||||
src = paddle.transpose(src, [1, 2, 0])
|
||||
src = paddle.unsqueeze(src, 2)
|
||||
src2 = self.conv2(F.relu(self.conv1(src)))
|
||||
src2 = paddle.squeeze(src2, 2)
|
||||
src2 = src2.transpose([2, 0, 1])
|
||||
src2 = paddle.transpose(src2, [2, 0, 1])
|
||||
src = paddle.squeeze(src, 2)
|
||||
src = src.transpose([2, 0, 1])
|
||||
src = paddle.transpose(src, [2, 0, 1])
|
||||
|
||||
src = src + self.dropout2(src2)
|
||||
src = self.norm2(src)
|
||||
|
@ -598,7 +579,7 @@ class TransformerDecoderLayer(nn.Layer):
|
|||
tgt,
|
||||
tgt,
|
||||
attn_mask=tgt_mask,
|
||||
key_padding_mask=tgt_key_padding_mask)[0]
|
||||
key_padding_mask=tgt_key_padding_mask)
|
||||
tgt = tgt + self.dropout1(tgt2)
|
||||
tgt = self.norm1(tgt)
|
||||
tgt2 = self.multihead_attn(
|
||||
|
@ -606,18 +587,18 @@ class TransformerDecoderLayer(nn.Layer):
|
|||
memory,
|
||||
memory,
|
||||
attn_mask=memory_mask,
|
||||
key_padding_mask=memory_key_padding_mask)[0]
|
||||
key_padding_mask=memory_key_padding_mask)
|
||||
tgt = tgt + self.dropout2(tgt2)
|
||||
tgt = self.norm2(tgt)
|
||||
|
||||
# default
|
||||
tgt = tgt.transpose([1, 2, 0])
|
||||
tgt = paddle.transpose(tgt, [1, 2, 0])
|
||||
tgt = paddle.unsqueeze(tgt, 2)
|
||||
tgt2 = self.conv2(F.relu(self.conv1(tgt)))
|
||||
tgt2 = paddle.squeeze(tgt2, 2)
|
||||
tgt2 = tgt2.transpose([2, 0, 1])
|
||||
tgt2 = paddle.transpose(tgt2, [2, 0, 1])
|
||||
tgt = paddle.squeeze(tgt, 2)
|
||||
tgt = tgt.transpose([2, 0, 1])
|
||||
tgt = paddle.transpose(tgt, [2, 0, 1])
|
||||
|
||||
tgt = tgt + self.dropout3(tgt2)
|
||||
tgt = self.norm3(tgt)
|
||||
|
@ -656,8 +637,8 @@ class PositionalEncoding(nn.Layer):
|
|||
(-math.log(10000.0) / dim))
|
||||
pe[:, 0::2] = paddle.sin(position * div_term)
|
||||
pe[:, 1::2] = paddle.cos(position * div_term)
|
||||
pe = pe.unsqueeze(0)
|
||||
pe = pe.transpose([1, 0, 2])
|
||||
pe = paddle.unsqueeze(pe, 0)
|
||||
pe = paddle.transpose(pe, [1, 0, 2])
|
||||
self.register_buffer('pe', pe)
|
||||
|
||||
def forward(self, x):
|
||||
|
@ -670,7 +651,7 @@ class PositionalEncoding(nn.Layer):
|
|||
Examples:
|
||||
>>> output = pos_encoder(x)
|
||||
"""
|
||||
x = x + self.pe[:x.shape[0], :]
|
||||
x = x + self.pe[:paddle.shape(x)[0], :]
|
||||
return self.dropout(x)
|
||||
|
||||
|
||||
|
@ -702,7 +683,7 @@ class PositionalEncoding_2d(nn.Layer):
|
|||
(-math.log(10000.0) / dim))
|
||||
pe[:, 0::2] = paddle.sin(position * div_term)
|
||||
pe[:, 1::2] = paddle.cos(position * div_term)
|
||||
pe = pe.unsqueeze(0).transpose([1, 0, 2])
|
||||
pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
|
||||
self.register_buffer('pe', pe)
|
||||
|
||||
self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1))
|
||||
|
@ -722,22 +703,23 @@ class PositionalEncoding_2d(nn.Layer):
|
|||
Examples:
|
||||
>>> output = pos_encoder(x)
|
||||
"""
|
||||
w_pe = self.pe[:x.shape[-1], :]
|
||||
w_pe = self.pe[:paddle.shape(x)[-1], :]
|
||||
w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
|
||||
w_pe = w_pe * w1
|
||||
w_pe = w_pe.transpose([1, 2, 0])
|
||||
w_pe = w_pe.unsqueeze(2)
|
||||
w_pe = paddle.transpose(w_pe, [1, 2, 0])
|
||||
w_pe = paddle.unsqueeze(w_pe, 2)
|
||||
|
||||
h_pe = self.pe[:x.shape[-2], :]
|
||||
h_pe = self.pe[:paddle.shape(x).shape[-2], :]
|
||||
w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
|
||||
h_pe = h_pe * w2
|
||||
h_pe = h_pe.transpose([1, 2, 0])
|
||||
h_pe = h_pe.unsqueeze(3)
|
||||
h_pe = paddle.transpose(h_pe, [1, 2, 0])
|
||||
h_pe = paddle.unsqueeze(h_pe, 3)
|
||||
|
||||
x = x + w_pe + h_pe
|
||||
x = x.reshape(
|
||||
[x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]).transpose(
|
||||
[2, 0, 1])
|
||||
x = paddle.transpose(
|
||||
paddle.reshape(x,
|
||||
[x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
|
||||
[2, 0, 1])
|
||||
|
||||
return self.dropout(x)
|
||||
|
||||
|
@ -817,7 +799,7 @@ class Beam():
|
|||
def sort_scores(self):
|
||||
"Sort the scores."
|
||||
return self.scores, paddle.to_tensor(
|
||||
[i for i in range(self.scores.shape[0])], dtype='int32')
|
||||
[i for i in range(int(self.scores.shape[0]))], dtype='int32')
|
||||
|
||||
def get_the_best_score_and_idx(self):
|
||||
"Get the score of the best in the beam."
|
||||
|
|
|
@ -176,7 +176,19 @@ class NRTRLabelDecode(BaseRecLabelDecode):
|
|||
else:
|
||||
preds_idx = preds
|
||||
|
||||
text = self.decode(preds_idx)
|
||||
if len(preds) == 2:
|
||||
preds_id = preds[0]
|
||||
preds_prob = preds[1]
|
||||
if isinstance(preds_id, paddle.Tensor):
|
||||
preds_id = preds_id.numpy()
|
||||
if isinstance(preds_prob, paddle.Tensor):
|
||||
preds_prob = preds_prob.numpy()
|
||||
if preds_id[0][0] == 2:
|
||||
preds_idx = preds_id[:, 1:]
|
||||
preds_prob = preds_prob[:, 1:]
|
||||
else:
|
||||
preds_idx = preds_id
|
||||
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
|
||||
if label is None:
|
||||
return text
|
||||
label = self.decode(label[:,1:])
|
||||
|
|
|
@ -60,6 +60,8 @@ def export_single_model(model, arch_config, save_path, logger):
|
|||
"When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training"
|
||||
)
|
||||
infer_shape[-1] = 100
|
||||
if arch_config["algorithm"] == "NRTR":
|
||||
infer_shape = [1, 32, 100]
|
||||
elif arch_config["model_type"] == "table":
|
||||
infer_shape = [3, 488, 488]
|
||||
model = to_static(
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
# limitations under the License.
|
||||
import os
|
||||
import sys
|
||||
|
||||
from PIL import Image
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(__dir__)
|
||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
@ -61,6 +61,13 @@ class TextRecognizer(object):
|
|||
"character_dict_path": args.rec_char_dict_path,
|
||||
"use_space_char": args.use_space_char
|
||||
}
|
||||
elif self.rec_algorithm == 'NRTR':
|
||||
postprocess_params = {
|
||||
'name': 'NRTRLabelDecode',
|
||||
"character_type": args.rec_char_type,
|
||||
"character_dict_path": args.rec_char_dict_path,
|
||||
"use_space_char": args.use_space_char
|
||||
}
|
||||
self.postprocess_op = build_post_process(postprocess_params)
|
||||
self.predictor, self.input_tensor, self.output_tensors, self.config = \
|
||||
utility.create_predictor(args, 'rec', logger)
|
||||
|
@ -87,6 +94,16 @@ class TextRecognizer(object):
|
|||
|
||||
def resize_norm_img(self, img, max_wh_ratio):
|
||||
imgC, imgH, imgW = self.rec_image_shape
|
||||
if self.rec_algorithm == 'NRTR':
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||
# return padding_im
|
||||
image_pil = Image.fromarray(np.uint8(img))
|
||||
img = image_pil.resize([100, 32], Image.ANTIALIAS)
|
||||
img = np.array(img)
|
||||
norm_img = np.expand_dims(img, -1)
|
||||
norm_img = norm_img.transpose((2, 0, 1))
|
||||
return norm_img.astype(np.float32) / 128. - 1.
|
||||
|
||||
assert imgC == img.shape[2]
|
||||
max_wh_ratio = max(max_wh_ratio, imgW / imgH)
|
||||
imgW = int((32 * max_wh_ratio))
|
||||
|
@ -252,14 +269,16 @@ class TextRecognizer(object):
|
|||
else:
|
||||
self.input_tensor.copy_from_cpu(norm_img_batch)
|
||||
self.predictor.run()
|
||||
|
||||
outputs = []
|
||||
for output_tensor in self.output_tensors:
|
||||
output = output_tensor.copy_to_cpu()
|
||||
outputs.append(output)
|
||||
if self.benchmark:
|
||||
self.autolog.times.stamp()
|
||||
preds = outputs[0]
|
||||
if len(outputs) != 1:
|
||||
preds = outputs
|
||||
else:
|
||||
preds = outputs[0]
|
||||
rec_result = self.postprocess_op(preds)
|
||||
for rno in range(len(rec_result)):
|
||||
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
|
||||
|
|
Loading…
Reference in New Issue