import math

import paddle
from paddle.nn import functional as F


def positional_encoding(start_index, length, size, dtype=None):
    """
    Generate standard positional encoding.

    pe(pos, 2i) = sin(pos / 10000 ** (2i / size))
    pe(pos, 2i+1) = cos(pos / 10000 ** (2i / size))

    This implementation deviates from the standard implementation in that the
    sin/cos channels are not interleaved.

    Args:
        start_index (int): the start index.
        length (int): the length of the positional encoding.
        size (int): positional encoding dimension.
        dtype (str, optional): data type of the encoding. Defaults to
            paddle's default dtype.

    Returns:
        encodings (Tensor): shape(length, size), the positional encoding.
    """
    if size % 2 != 0:
        raise ValueError("size should be divisible by 2")
    dtype = dtype or paddle.get_default_dtype()
    channel = paddle.arange(0, size, 2, dtype=dtype)
    index = paddle.arange(start_index, start_index + length, 1, dtype=dtype)
    # (length, size // 2): pos / 10000 ** (2i / size)
    p = paddle.unsqueeze(index, -1) / (10000 ** (channel / float(size)))
    # concatenate the sin and cos halves instead of interleaving them
    encodings = paddle.concat([paddle.sin(p), paddle.cos(p)], axis=-1)
    return encodings


def scalable_positional_encoding(start_index, length, size, omega):
    """
    A scalable positional encoding, which extends the standard positional
    encoding by adding a positioning rate (denoted as omega).

    pe(pos, 2i) = sin(omega * pos / 10000 ** (2i / size))
    pe(pos, 2i+1) = cos(omega * pos / 10000 ** (2i / size))

    This implementation deviates from the standard implementation in that the
    sin/cos channels are not interleaved.

    Args:
        start_index (int): the start index.
        length (int): the length of the positional encoding.
        size (int): positional encoding dimension.
        omega (Tensor): shape(batch_size, ), positioning rates.

    Returns:
        encodings (Tensor): shape(batch_size, length, size), position
            embedding, the data type is the same as omega.
    """
    dtype = omega.dtype
    index = paddle.arange(start_index, start_index + length, 1, dtype=dtype)
    channel = paddle.arange(0, size, 2, dtype=dtype)

    # (batch_size, length, size // 2): omega * pos / 10000 ** (2i / size)
    p = (paddle.unsqueeze(omega, [1, 2])
         * paddle.unsqueeze(index, [1])
         / (10000 ** (channel / float(size))))
    # concatenate the sin and cos halves instead of interleaving them
    encodings = paddle.concat([paddle.sin(p), paddle.cos(p)], axis=-1)
    return encodings
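

# A minimal usage sketch (not part of the original module): it assumes paddle
# is installed and only checks the output shapes produced by both encodings.
if __name__ == "__main__":
    # standard positional encoding: one encoding per position
    pe = positional_encoding(start_index=0, length=10, size=8)
    print(pe.shape)  # [10, 8]

    # scalable positional encoding: one positioning rate per batch element
    # (here an illustrative batch of size 2)
    omega = paddle.to_tensor([1.0, 0.5])
    spe = scalable_positional_encoding(
        start_index=0, length=10, size=8, omega=omega)
    print(spe.shape)  # [2, 10, 8]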