Add files via upload

KnowPrompt_2
TimelordRi 2021-09-26 10:31:21 +08:00 committed by GitHub
parent 7d487e3441
commit 0b3f5f7d06
29 changed files with 14170 additions and 0 deletions


@@ -0,0 +1,2 @@
from .transformer import *
from .base import BaseLitModel


@@ -0,0 +1,122 @@
import argparse
import torch
from transformers.optimization import get_linear_schedule_with_warmup
from torch import nn
OPTIMIZER = "AdamW"
LR = 5e-5
LOSS = "cross_entropy"
ONE_CYCLE_TOTAL_STEPS = 100
class Config(dict):
def __getattr__(self, name):
return self.get(name)
def __setattr__(self, name, val):
self[name] = val
class BaseLitModel(nn.Module):
"""
Generic Lightning-style base class (implemented here as a plain nn.Module) that must be initialized with a PyTorch module.
"""
def __init__(self, model, args: argparse.Namespace = None, device: torch.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')):
super().__init__()
self.model = model
self.cur_model = model.module if hasattr(model, 'module') else model
self.device = device
self.args = Config(vars(args)) if args is not None else {}
optimizer = self.args.get("optimizer", OPTIMIZER)
self.optimizer_class = getattr(torch.optim, optimizer)
self.lr = self.args.get("lr", LR)
self.one_cycle_max_lr = self.args.get("one_cycle_max_lr", None)
self.one_cycle_total_steps = self.args.get("one_cycle_total_steps", ONE_CYCLE_TOTAL_STEPS)
@staticmethod
def add_to_argparse(parser):
parser.add_argument("--optimizer", type=str, default=OPTIMIZER, help="optimizer class from torch.optim")
parser.add_argument("--lr", type=float, default=LR)
parser.add_argument("--weight_decay", type=float, default=0.01)
return parser
def configure_optimizers(self):
optimizer = self.optimizer_class(self.parameters(), lr=self.lr)
if self.one_cycle_max_lr is None:
return optimizer
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=self.one_cycle_max_lr, total_steps=self.one_cycle_total_steps)
return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
def forward(self, x):
return self.model(x)
def training_step(self, batch, batch_idx): # pylint: disable=unused-argument
x, y = batch
x, y = x.to(self.device), y.to(self.device)
logits = x
loss = (logits - y) ** 2
print("train_loss: ", loss)
#self.train_acc(logits, y)
#self.log("train_acc", self.train_acc, on_step=False, on_epoch=True)
return loss
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
x, y = batch
x, y = x.to(self.device), y.to(self.device)
logits = x
loss = (logits - y) ** 2
print("val_loss: ", loss)
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
x, y = batch
x, y = x.to(self.device), y.to(self.device)
logits = x
loss = (logits - y) ** 2
print("test_loss: ", loss)
def configure_optimizers(self):
no_decay_param = ["bias", "LayerNorm.weight"]
optimizer_group_parameters = [
{"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay_param)], "weight_decay": self.args.weight_decay},
{"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay_param)], "weight_decay": 0}
]
optimizer = self.optimizer_class(optimizer_group_parameters, lr=self.lr, eps=1e-8)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.num_training_steps * 0.1, num_training_steps=self.num_training_steps)
return optimizer
'''return {
"optimizer": optimizer,
"lr_scheduler":{
'scheduler': scheduler,
'interval': 'step', # or 'epoch'
'frequency': 1,
}
}'''
@property
def num_training_steps(self) -> int:
"""Total training steps inferred from datamodule and devices."""
if isinstance(self.trainer.limit_train_batches, int) and self.trainer.limit_train_batches != 0:
dataset_size = self.trainer.limit_train_batches
elif isinstance(self.trainer.limit_train_batches, float):
# limit_train_batches is a percentage of batches
dataset_size = len(self.trainer.datamodule.train_dataloader())
dataset_size = int(dataset_size * self.trainer.limit_train_batches)
else:
dataset_size = len(self.trainer.datamodule.train_dataloader())
num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
if self.trainer.tpu_cores:
num_devices = max(num_devices, self.trainer.tpu_cores)
effective_batch_size = self.trainer.accumulate_grad_batches * num_devices
max_estimated_steps = (dataset_size // effective_batch_size) * self.trainer.max_epochs
if self.trainer.max_steps and self.trainer.max_steps < max_estimated_steps:
return self.trainer.max_steps
return max_estimated_steps
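# --- Illustrative usage sketch (not part of the original commit) ---
# A hedged example of how this wrapper is typically wired up: build args with
# add_to_argparse, wrap a tiny stand-in nn.Module, and create the optimizer.
# The Linear model and the argument values below are hypothetical.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = BaseLitModel.add_to_argparse(parser)
    demo_args = parser.parse_args(["--optimizer", "AdamW", "--lr", "3e-5"])

    lit_model = BaseLitModel(nn.Linear(4, 2), demo_args)

    # Config returns None for missing keys instead of raising AttributeError.
    assert lit_model.args.optimizer == "AdamW"
    assert lit_model.args.not_a_real_key is None

    optimizer = lit_model.configure_optimizers()
    print(type(optimizer).__name__, lit_model.lr)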


@@ -0,0 +1,571 @@
from logging import debug
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
# Hide lines below until Lab 5
import wandb
import numpy as np
# Hide lines above until Lab 5
from .base import BaseLitModel
from .util import dialog_f1_eval, f1_eval, compute_f1, acc, f1_score
from transformers.optimization import get_linear_schedule_with_warmup
from functools import partial
def mask_hook(grad_input, st, ed):
mask = torch.zeros((grad_input.shape[0], 1)).type_as(grad_input)
mask[st: ed] += 1.0 # only optimize the token with id 18
# for the speaker unused token12
mask[1:3] += 1.0
return grad_input * mask
def multilabel_categorical_crossentropy(y_pred, y_true):
y_pred = (1 - 2 * y_true) * y_pred
y_pred_neg = y_pred - y_true * 1e12
y_pred_pos = y_pred - (1 - y_true) * 1e12
zeros = torch.zeros_like(y_pred[..., :1])
y_pred_neg = torch.cat([y_pred_neg, zeros], dim=-1)
y_pred_pos = torch.cat([y_pred_pos, zeros], dim=-1)
neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
pos_loss = torch.logsumexp(y_pred_pos, dim=-1)
return (neg_loss + pos_loss).mean()
class BertLitModel(BaseLitModel):
"""
Uses AutoModelForMaskedLM and maps the [MASK]-position output to relation labels inside the lit model.
"""
def __init__(self, model, args, tokenizer):
super().__init__(model, args)
self.tokenizer = tokenizer
with open(f"{args.data_dir}/rel2id.json","r") as file:
rel2id = json.load(file)
Na_num = 0
for k, v in rel2id.items():
if k == "NA" or k == "no_relation" or k == "Other":
Na_num = v
break
num_relation = len(rel2id)
# init loss function
self.loss_fn = multilabel_categorical_crossentropy if "dialogue" in args.data_dir else nn.CrossEntropyLoss()
# ignore the no_relation class to compute the f1 score
self.eval_fn = f1_eval if "dialogue" in args.data_dir else partial(f1_score, rel_num=num_relation, na_num=Na_num)
self.best_f1 = 0
self.t_lambda = args.t_lambda
self.label_st_id = tokenizer("[class1]", add_special_tokens=False)['input_ids'][0]
self._init_label_word()
def _init_label_word(self):
args = self.args
# ./dataset/dataset_name
dataset_name = args.data_dir.split("/")[1]
model_name_or_path = args.model_name_or_path.split("/")[-1]
label_path = f"./dataset/{model_name_or_path}_{dataset_name}.pt"
# [num_labels, num_tokens], ignore the unanswerable
if "dialogue" in args.data_dir:
label_word_idx = torch.load(label_path)[:-1]
else:
label_word_idx = torch.load(label_path)
num_labels = len(label_word_idx)
self.cur_model.resize_token_embeddings(len(self.tokenizer))
with torch.no_grad():
word_embeddings = self.cur_model.get_input_embeddings()
continous_label_word = [a[0] for a in self.tokenizer([f"[class{i}]" for i in range(1, num_labels+1)], add_special_tokens=False)['input_ids']]
for i, idx in enumerate(label_word_idx):
word_embeddings.weight[continous_label_word[i]] = torch.mean(word_embeddings.weight[idx], dim=0)
# word_embeddings.weight[continous_label_word[i]] = self.relation_embedding[i]
so_word = [a[0] for a in self.tokenizer(["[obj]","[sub]"], add_special_tokens=False)['input_ids']]
meaning_word = [a[0] for a in self.tokenizer(["person","organization", "location", "date", "country"], add_special_tokens=False)['input_ids']]
for i, idx in enumerate(so_word):
word_embeddings.weight[so_word[i]] = torch.mean(word_embeddings.weight[meaning_word], dim=0)
assert torch.equal(self.cur_model.get_input_embeddings().weight, word_embeddings.weight)
assert torch.equal(self.cur_model.get_input_embeddings().weight, self.cur_model.get_output_embeddings().weight)
self.word2label = continous_label_word # a continous list
def forward(self, x):
return self.model(x)
def training_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids , labels, so = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
so = so.to(self.device)
result = self.model(input_ids, attention_mask, token_type_ids, return_dict=True, output_hidden_states=True)
logits = result.logits
output_embedding = result.hidden_states[-1]
logits = self.pvp(logits, input_ids)
loss = self.loss_fn(logits, labels) + self.t_lambda * self.ke_loss(output_embedding, labels, so)
#print("Train/loss: ", loss)
return loss
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids , labels, _ = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask, token_type_ids, return_dict=True).logits
logits = self.pvp(logits, input_ids)
loss = self.loss_fn(logits, labels)
#print("Eval/loss: ", loss)
return {"loss": loss, "eval_logits": logits.detach().cpu().numpy(), "eval_labels": labels.detach().cpu().numpy()}
def validation_epoch_end(self, outputs):
logits = np.concatenate([o["eval_logits"] for o in outputs])
labels = np.concatenate([o["eval_labels"] for o in outputs])
f1 = self.eval_fn(logits, labels)['f1']
#print("Eval/f1: ", f1)
best_f1 = -1
if f1 > self.best_f1:
self.best_f1 = f1
best_f1 = self.best_f1
#print("Eval/best_f1: ", self.best_f1)
return f1, best_f1, self.best_f1
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids , labels, _ = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask, token_type_ids, return_dict=True).logits
logits = self.pvp(logits, input_ids)
return {"test_logits": logits.detach().cpu().numpy(), "test_labels": labels.detach().cpu().numpy()}
def test_epoch_end(self, outputs):
logits = np.concatenate([o["test_logits"] for o in outputs])
labels = np.concatenate([o["test_labels"] for o in outputs])
f1 = self.eval_fn(logits, labels)['f1']
#print("Test/f1: ", f1)
return f1
@staticmethod
def add_to_argparse(parser):
BaseLitModel.add_to_argparse(parser)
parser.add_argument("--t_lambda", type=float, default=0.01, help="")
return parser
def pvp(self, logits, input_ids):
# convert the [batch_size, seq_len, vocab_size] => [batch_size, num_labels]
#! hard coded
_, mask_idx = (input_ids == 103).nonzero(as_tuple=True)
bs = input_ids.shape[0]
mask_output = logits[torch.arange(bs), mask_idx]
assert mask_idx.shape[0] == bs, "only one mask in sequence!"
final_output = mask_output[:,self.word2label]
return final_output
def ke_loss(self, logits, labels, so):
subject_embedding = []
object_embedding = []
bsz = logits.shape[0]
for i in range(bsz):
subject_embedding.append(torch.mean(logits[i, so[i][0]:so[i][1]], dim=0))
object_embedding.append(torch.mean(logits[i, so[i][2]:so[i][3]], dim=0))
subject_embedding = torch.stack(subject_embedding)
object_embedding = torch.stack(object_embedding)
# trick: the relation label tokens are stored contiguously, so index by labels + label_st_id
relation_embedding = self.cur_model.get_output_embeddings().weight[labels+self.label_st_id]
loss = torch.norm(subject_embedding + relation_embedding - object_embedding, p=2)
return loss
def configure_optimizers(self):
no_decay_param = ["bias", "LayerNorm.weight"]
if not self.args.two_steps:
parameters = self.cur_model.named_parameters()
else:
# cur_model.bert.embeddings.weight
parameters = [next(self.cur_model.named_parameters())]
# only optimize the embedding parameters
optimizer_group_parameters = [
{"params": [p for n, p in parameters if not any(nd in n for nd in no_decay_param)], "weight_decay": self.args.weight_decay},
{"params": [p for n, p in parameters if any(nd in n for nd in no_decay_param)], "weight_decay": 0}
]
optimizer = self.optimizer_class(optimizer_group_parameters, lr=self.lr, eps=1e-8)
return optimizer
'''return {
"optimizer": optimizer,
"lr_scheduler":{
'scheduler': scheduler,
'interval': 'step', # or 'epoch'
'frequency': 1,
}
}'''
class dialog_BertLitModel(BaseLitModel):
"""
Uses AutoModelForMaskedLM and maps the [MASK]-position output to relation labels inside the lit model.
"""
def __init__(self, model, args, tokenizer, device):
super().__init__(model, args)
self.tokenizer = tokenizer
self.device = device
with open(f"{args.data_dir}/rel2id.json", "r") as file:
rel2id = json.load(file)
Na_num = 0
for k, v in rel2id.items():
if k == "NA" or k == "no_relation" or k == "Other":
Na_num = v
break
num_relation = len(rel2id)
# init loss function
self.loss_fn = multilabel_categorical_crossentropy if "dialogue" in args.data_dir else nn.CrossEntropyLoss()
# ignore the no_relation class to compute the f1 score
self.eval_fn = dialog_f1_eval if "dialogue" in args.data_dir else partial(f1_score, rel_num=num_relation,
na_num=Na_num)
self.best_f1 = 0
self.t_lambda = args.t_lambda
self.label_st_id = tokenizer("[class1]", add_special_tokens=False)['input_ids'][0]
self._init_label_word()
def _init_label_word(self):
args = self.args
# ./dataset/dataset_name
dataset_name = args.data_dir.split("/")[1]
model_name_or_path = args.model_name_or_path.split("/")[-1]
label_path = f"./dataset/{model_name_or_path}_{dataset_name}.pt"
# [num_labels, num_tokens], ignore the unanswerable
if "dialogue" in args.data_dir:
label_word_idx = torch.load(label_path)[:-1]
else:
label_word_idx = torch.load(label_path)
num_labels = len(label_word_idx)
#print(len(self.tokenizer))
self.cur_model.resize_token_embeddings(len(self.tokenizer))
with torch.no_grad():
word_embeddings = self.cur_model.get_input_embeddings()
continous_label_word = [a[0] for a in self.tokenizer([f"[class{i}]" for i in range(1, num_labels + 1)],
add_special_tokens=False)['input_ids']]
for i, idx in enumerate(label_word_idx):
word_embeddings.weight[continous_label_word[i]] = torch.mean(word_embeddings.weight[idx], dim=0)
# word_embeddings.weight[continous_label_word[i]] = self.relation_embedding[i]
so_word = [a[0] for a in self.tokenizer(["[obj]", "[sub]"], add_special_tokens=False)['input_ids']]
meaning_word = [a[0] for a in self.tokenizer(["person", "organization", "location", "date", "country"],
add_special_tokens=False)['input_ids']]
for i, idx in enumerate(so_word):
word_embeddings.weight[so_word[i]] = torch.mean(word_embeddings.weight[meaning_word], dim=0)
assert torch.equal(self.cur_model.get_input_embeddings().weight, word_embeddings.weight)
assert torch.equal(self.cur_model.get_input_embeddings().weight, self.cur_model.get_output_embeddings().weight)
self.word2label = continous_label_word # a continous list
def forward(self, x):
return self.model(x)
def training_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids, labels, so = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
so = so.to(self.device)
result = self.model(input_ids, attention_mask, token_type_ids, return_dict=True, output_hidden_states=True)
logits = result.logits
output_embedding = result.hidden_states[-1]
logits = self.pvp(logits, input_ids)
loss = self.loss_fn(logits, labels) + self.t_lambda * self.ke_loss(output_embedding, labels, so)
#print("Train/loss: ", loss)
return loss
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids, labels, _ = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask, token_type_ids, return_dict=True).logits
logits = self.pvp(logits, input_ids)
loss = self.loss_fn(logits, labels)
#print("Eval/loss: ", loss)
return {"loss": loss, "eval_logits": logits.detach().cpu().numpy(), "eval_labels": labels.detach().cpu().numpy()}
def validation_epoch_end(self, outputs) -> None:
logits = np.concatenate([o["eval_logits"] for o in outputs])
labels = np.concatenate([o["eval_labels"] for o in outputs])
f1 = self.eval_fn(logits, labels)['f1']
#print("Eval/f1: ", f1)
best_f1 = -1
if f1 > self.best_f1:
self.best_f1 = f1
best_f1 = self.best_f1
#print("Eval/best_f1: ", self.best_f1)
return f1, best_f1, self.best_f1
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids, labels, _ = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask, token_type_ids, return_dict=True).logits
logits = self.pvp(logits, input_ids)
return {"test_logits": logits.detach().cpu().numpy(), "test_labels": labels.detach().cpu().numpy()}
def test_epoch_end(self, outputs):
logits = np.concatenate([o["test_logits"] for o in outputs])
labels = np.concatenate([o["test_labels"] for o in outputs])
f1 = self.eval_fn(logits, labels)['f1']
#print("Test/f1: ", f1)
return f1
@staticmethod
def add_to_argparse(parser):
BaseLitModel.add_to_argparse(parser)
parser.add_argument("--t_lambda", type=float, default=0.01, help="")
return parser
def pvp(self, logits, input_ids):
# convert the [batch_size, seq_len, vocab_size] => [batch_size, num_labels]
# ! hard coded
_, mask_idx = (input_ids == 103).nonzero(as_tuple=True)
bs = input_ids.shape[0]
mask_output = logits[torch.arange(bs), mask_idx]
assert mask_idx.shape[0] == bs, "only one mask in sequence!"
final_output = mask_output[:, self.word2label]
return final_output
def ke_loss(self, logits, labels, so):
subject_embedding = []
object_embedding = []
bsz = logits.shape[0]
for i in range(bsz):
subject_embedding.append(torch.mean(logits[i, so[i][0]:so[i][1]], dim=0))
object_embedding.append(torch.mean(logits[i, so[i][2]:so[i][3]], dim=0))
subject_embedding = torch.stack(subject_embedding)
object_embedding = torch.stack(object_embedding)
# trick: the relation label tokens are stored contiguously, so index by labels + label_st_id
relation_embedding = self.cur_model.get_output_embeddings().weight[labels + self.label_st_id]
loss = torch.norm(subject_embedding + relation_embedding - object_embedding, p=2)
return loss
def configure_optimizers(self):
no_decay_param = ["bias", "LayerNorm.weight"]
if not self.args.two_steps:
parameters = self.cur_model.named_parameters()
else:
# cur_model.bert.embeddings.weight
parameters = [next(self.cur_model.named_parameters())]
# only optimize the embedding parameters
optimizer_group_parameters = [
{"params": [p for n, p in parameters if not any(nd in n for nd in no_decay_param)],
"weight_decay": self.args.weight_decay},
{"params": [p for n, p in parameters if any(nd in n for nd in no_decay_param)], "weight_decay": 0}
]
optimizer = self.optimizer_class(optimizer_group_parameters, lr=self.lr, eps=1e-8)
return optimizer
'''return {
"optimizer": optimizer,
"lr_scheduler": {
'scheduler': scheduler,
'interval': 'step', # or 'epoch'
'frequency': 1,
}
}'''
class TransformerLitModelTwoSteps(BertLitModel):
def configure_optimizers(self):
no_decay_param = ["bais", "LayerNorm.weight"]
optimizer_group_parameters = [
{"params": [p for n, p in self.cur_model.named_parameters() if not any(nd in n for nd in no_decay_param)], "weight_decay": self.args.weight_decay},
{"params": [p for n, p in self.cur_model.named_parameters() if any(nd in n for nd in no_decay_param)], "weight_decay": 0}
]
optimizer = self.optimizer_class(optimizer_group_parameters, lr=self.args.lr_2, eps=1e-8)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.num_training_steps * 0.1, num_training_steps=self.num_training_steps)
return optimizer
'''return {
"optimizer": optimizer,
"lr_scheduler":{
'scheduler': scheduler,
'interval': 'step', # or 'epoch'
'frequency': 1,
}
}'''
class DialogueLitModel(dialog_BertLitModel):
def training_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids, labels = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
result = self.model(input_ids, attention_mask, token_type_ids, return_dict=True, output_hidden_states=True)
logits = result.logits
logits = self.pvp(logits, input_ids)
loss = self.loss_fn(logits, labels)
#print("Train/loss: ", loss)
return loss
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids, labels = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask, token_type_ids, return_dict=True).logits
logits = self.pvp(logits, input_ids)
loss = self.loss_fn(logits, labels)
#print("Eval/loss: ", loss)
return {"loss": loss, "eval_logits": logits.detach().cpu().numpy(), "eval_labels": labels.detach().cpu().numpy()}
def validation_epoch_end(self, outputs) -> None:
logits = np.concatenate([o["eval_logits"] for o in outputs])
labels = np.concatenate([o["eval_labels"] for o in outputs])
f1 = self.eval_fn(logits, labels)['f1']
#print("Eval/f1: ", f1)
best_f1 = -1
if f1 > self.best_f1:
self.best_f1 = f1
best_f1 = self.best_f1
#print("Eval/best_f1: ", self.best_f1)
return f1, best_f1, self.best_f1
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, token_type_ids, labels = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
token_type_ids = token_type_ids.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask, token_type_ids, return_dict=True).logits
logits = self.pvp(logits, input_ids)
return {"test_logits": logits.detach().cpu().numpy(), "test_labels": labels.detach().cpu().numpy()}
def test_epoch_end(self, outputs):
logits = np.concatenate([o["test_logits"] for o in outputs])
labels = np.concatenate([o["test_labels"] for o in outputs])
f1 = self.eval_fn(logits, labels)['f1']
#print("Test/f1: ", f1)
return f1
@staticmethod
def add_to_argparse(parser):
BaseLitModel.add_to_argparse(parser)
parser.add_argument("--t_lambda", type=float, default=0.01, help="")
return parser
def pvp(self, logits, input_ids):
# convert the [batch_size, seq_len, vocab_size] => [batch_size, num_labels]
# ! hard coded
_, mask_idx = (input_ids == 103).nonzero(as_tuple=True)
bs = input_ids.shape[0]
mask_output = logits[torch.arange(bs), mask_idx]
assert mask_idx.shape[0] == bs, "only one mask in sequence!"
final_output = mask_output[:, self.word2label]
return final_output
class GPTLitModel(BaseLitModel):
def __init__(self, model, args , data_config):
super().__init__(model, args)
# self.num_training_steps = data_config["num_training_steps"]
self.loss_fn = nn.CrossEntropyLoss()
# self.loss_fn = multilabel_categorical_crossentropy
self.best_f1 = 0
def forward(self, x):
return self.model(x)
def training_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, cls_idx , labels = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
cls_idx = cls_idx.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask=attention_mask, mc_token_ids=cls_idx)
if not isinstance(logits, torch.Tensor):
logits = logits.mc_logits
loss = self.loss_fn(logits, labels)
#print("Train/loss: ", loss)
return loss
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, cls_idx , labels = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
cls_idx = cls_idx.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask=attention_mask, mc_token_ids=cls_idx)
if not isinstance(logits, torch.Tensor):
logits = logits.mc_logits
loss = self.loss_fn(logits, labels)
#print("Eval/loss: ", loss)
return {"loss": loss, "eval_logits": logits.detach().cpu().numpy(), "eval_labels": labels.detach().cpu().numpy()}
def validation_epoch_end(self, outputs) -> None:
logits = np.concatenate([o["eval_logits"] for o in outputs])
labels = np.concatenate([o["eval_labels"] for o in outputs])
# f1 = compute_f1(logits, labels)["f1"]
f1 = f1_score(logits, labels)['f1']
#print("Eval/f1: ", f1)
best_f1 = -1
if f1 > self.best_f1:
self.best_f1 = f1
best_f1 = self.best_f1
#print("Eval/best_f1: ", self.best_f1)
return f1, best_f1, self.best_f1
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
input_ids, attention_mask, cls_idx , labels = batch
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
cls_idx = cls_idx.to(self.device)
labels = labels.to(self.device)
logits = self.model(input_ids, attention_mask=attention_mask, mc_token_ids=cls_idx)
if not isinstance(logits, torch.Tensor):
logits = logits.mc_logits
return {"test_logits": logits.detach().cpu().numpy(), "test_labels": labels.detach().cpu().numpy()}
def test_epoch_end(self, outputs):
logits = np.concatenate([o["test_logits"] for o in outputs])
labels = np.concatenate([o["test_labels"] for o in outputs])
f1 = f1_score(logits, labels)['f1']
# f1 = acc(logits, labels)
#print("Test/f1: ", f1)
return f1
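# --- Illustrative sketch (not part of the original commit) ---
# A hedged, self-contained demo of two core pieces defined above: (1) the pvp
# selection, which takes the MLM logits at the [MASK] position (hard-coded token
# id 103 for BERT) and keeps only the label-word columns, and (2) the multilabel
# categorical cross-entropy used in the dialogue setting. All tensors are random
# stand-ins and the word2label indices are hypothetical.
if __name__ == "__main__":
    torch.manual_seed(0)
    bs, seq_len, vocab_size = 2, 8, 120

    input_ids = torch.randint(0, 100, (bs, seq_len))
    input_ids[:, 3] = 103                      # one [MASK] per sequence
    mlm_logits = torch.randn(bs, seq_len, vocab_size)
    word2label = [110, 111, 112, 113]          # hypothetical label-word token ids

    # pvp: [bs, seq_len, vocab_size] -> [bs, num_labels]
    _, mask_idx = (input_ids == 103).nonzero(as_tuple=True)
    mask_output = mlm_logits[torch.arange(bs), mask_idx]
    label_logits = mask_output[:, word2label]
    print(label_logits.shape)                  # torch.Size([2, 4])

    # multilabel categorical cross-entropy on multi-hot targets
    y_true = torch.tensor([[1., 0., 0., 1.], [0., 1., 0., 0.]])
    loss = multilabel_categorical_crossentropy(label_logits, y_true)
    print(float(loss))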


@@ -0,0 +1,228 @@
import numpy as np
def dialog_f1_eval(logits, labels):
def getpred(result, T1=0.5, T2=0.4):
# turn the logits (result) into predictions using thresholds
# T2: if every score is below T2, predict no relation; otherwise keep the single highest-scoring label
ret = []
for i in range(len(result)):
r = []
maxl, maxj = -1, -1
for j in range(len(result[i])):
if result[i][j] > T1:
r += [j]
if result[i][j] > maxl:
maxl = result[i][j]
maxj = j
if len(r) == 0:
if maxl <= T2:
r = [36]
else:
r += [maxj]
ret.append(r)
return ret
def geteval(devp, data):
correct_sys, all_sys = 0, 0
correct_gt = 0
for i in range(len(data)):
# each sample is a list like [1, 4, ..., 20], meaning those labels are positive; [36] means no relation
for id in data[i]:
if id != 36:
# number of positive labels in the gold annotation
correct_gt += 1
if id in devp[i]:
# correctly predicted
correct_sys += 1
for id in devp[i]:
if id != 36:
all_sys += 1
precision = 1 if all_sys == 0 else correct_sys / all_sys
recall = 0 if correct_gt == 0 else correct_sys / correct_gt
f_1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0
return f_1
logits = np.asarray(logits)
logits = list(1 / (1 + np.exp(-logits)))
temp_labels = []
for l in labels:
t = []
for i in range(36):
if l[i] == 1:
t += [i]
if len(t) == 0:
t = [36]
temp_labels.append(t)
assert (len(labels) == len(logits))
labels = temp_labels
bestT2 = bestf_1 = 0
for T2 in range(51):
devp = getpred(logits, T2=T2 / 100.)
f_1 = geteval(devp, labels)
if f_1 > bestf_1:
bestf_1 = f_1
bestT2 = T2 / 100.
return dict(f1=bestf_1, T2=bestT2)
def f1_eval(logits, labels):
def getpred(result, T1 = 0.5, T2 = 0.4) :
# turn the logits (result) into predictions using thresholds
# T2: if every score is below T2, predict no relation; otherwise keep the single highest-scoring label
ret = []
for i in range(len(result)):
r = []
maxl, maxj = -1, -1
for j in range(len(result[i])):
if result[i][j] > T1:
r += [j]
if result[i][j] > maxl:
maxl = result[i][j]
maxj = j
if len(r) == 0:
if maxl <= T2:
r = [36]
else:
r += [maxj]
ret.append(r)
return ret
def geteval(devp, data):
correct_sys, all_sys = 0, 0
correct_gt = 0
for i in range(len(data)):
# each sample is a list like [1, 4, ..., 20], meaning those labels are positive; [36] means no relation
for id in data[i]:
if id != 36:
# number of positive labels in the gold annotation
correct_gt += 1
if id in devp[i]:
# correctly predicted
correct_sys += 1
for id in devp[i]:
if id != 36:
all_sys += 1
precision = 1 if all_sys == 0 else correct_sys/all_sys
recall = 0 if correct_gt == 0 else correct_sys/correct_gt
f_1 = 2*precision*recall/(precision+recall) if precision+recall != 0 else 0
return f_1
logits = np.asarray(logits)
logits = list(1 / (1 + np.exp(-logits)))
temp_labels = []
for l in labels:
t = []
for i in range(36):
if l[i] == 1:
t += [i]
if len(t) == 0:
t = [36]
temp_labels.append(t)
assert(len(labels) == len(logits))
labels = temp_labels
bestT2 = bestf_1 = 0
for T2 in range(51):
devp = getpred(logits, T2=T2/100.)
f_1 = geteval(devp, labels)
if f_1 > bestf_1:
bestf_1 = f_1
bestT2 = T2/100.
return bestf_1, bestT2
def compute_f1(logits, labels):
n_gold = n_pred = n_correct = 0
preds = np.argmax(logits, axis=-1)
for pred, label in zip(preds, labels):
if pred != 0:
n_pred += 1
if label != 0:
n_gold += 1
if pred != 0 and label != 0 and (pred == label):
n_correct += 1
if n_correct == 0:
return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
else:
prec = n_correct * 1.0 / n_pred
recall = n_correct * 1.0 / n_gold
if prec + recall > 0:
f1 = 2.0 * prec * recall / (prec + recall)
else:
f1 = 0.0
return {'precision': prec, 'recall': recall, 'f1': f1}
def acc(logits, labels):
preds = np.argmax(logits, axis=-1)
return (preds == labels).mean()
from collections import Counter
def f1_score(output, label, rel_num=42, na_num=13):
correct_by_relation = Counter()
guess_by_relation = Counter()
gold_by_relation = Counter()
output = np.argmax(output, axis=-1)
for i in range(len(output)):
guess = output[i]
gold = label[i]
if guess == na_num:
guess = 0
elif guess < na_num:
guess += 1
if gold == na_num:
gold = 0
elif gold < na_num:
gold += 1
if gold == 0 and guess == 0:
continue
if gold == 0 and guess != 0:
guess_by_relation[guess] += 1
if gold != 0 and guess == 0:
gold_by_relation[gold] += 1
if gold != 0 and guess != 0:
guess_by_relation[guess] += 1
gold_by_relation[gold] += 1
if gold == guess:
correct_by_relation[gold] += 1
f1_by_relation = Counter()
recall_by_relation = Counter()
prec_by_relation = Counter()
for i in range(1, rel_num):
recall = 0
if gold_by_relation[i] > 0:
recall = correct_by_relation[i] / gold_by_relation[i]
precision = 0
if guess_by_relation[i] > 0:
precision = correct_by_relation[i] / guess_by_relation[i]
if recall + precision > 0 :
f1_by_relation[i] = 2 * recall * precision / (recall + precision)
recall_by_relation[i] = recall
prec_by_relation[i] = precision
micro_f1 = 0
if sum(guess_by_relation.values()) != 0 and sum(correct_by_relation.values()) != 0:
recall = sum(correct_by_relation.values()) / sum(gold_by_relation.values())
prec = sum(correct_by_relation.values()) / sum(guess_by_relation.values())
micro_f1 = 2 * recall * prec / (recall+prec)
return dict(f1=micro_f1)
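# --- Illustrative sketch (not part of the original commit) ---
# A small, hedged example of the single-label metrics above on toy data. The
# labels and the rel_num/na_num values are hypothetical; only numpy is needed.
# (dialog_f1_eval / f1_eval expect 36-way multi-hot label lists and are not shown.)
if __name__ == "__main__":
    np.random.seed(0)
    num_examples, num_classes = 6, 5
    logits = np.random.randn(num_examples, num_classes)
    labels = np.array([0, 1, 2, 3, 4, 1])

    print(acc(logits, labels))                                        # plain accuracy
    print(compute_f1(logits, labels))                                 # micro P/R/F1, class 0 = no_relation
    print(f1_score(logits, labels, rel_num=num_classes, na_num=0))    # same idea, NA class remapped to 0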


@@ -0,0 +1 @@
# this is an empty file


@@ -0,0 +1,7 @@
from .bert import BertForSequenceClassification
from .bert.modeling_bert import BertGetLabelWord, BertUseLabelWord, BertDecouple, BertForMaskedLM
from .gpt2 import GPT2DoubleHeadsModel
from .gpt2.modeling_gpt2 import GPT2UseLabelWord
from .roberta import RobertaForSequenceClassification
from .roberta.modeling_roberta import RobertaUseLabelWord


@@ -0,0 +1,155 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from transformers.file_utils import (
_BaseLazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
from .modeling_bert import BertGetLabelWord, BertUseLabelWord
_import_structure = {
"configuration_bert": ["BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BertConfig"],
"tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"],
}
if is_tokenizers_available():
_import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"]
if is_torch_available():
_import_structure["modeling_bert"] = [
"BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"BertForMaskedLM",
"BertForMultipleChoice",
"BertForNextSentencePrediction",
"BertForPreTraining",
"BertForQuestionAnswering",
"BertForSequenceClassification",
"BertForTokenClassification",
"BertLayer",
"BertLMHeadModel",
"BertModel",
"BertPreTrainedModel",
"load_tf_weights_in_bert",
]
if is_tf_available():
_import_structure["modeling_tf_bert"] = [
"TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFBertEmbeddings",
"TFBertForMaskedLM",
"TFBertForMultipleChoice",
"TFBertForNextSentencePrediction",
"TFBertForPreTraining",
"TFBertForQuestionAnswering",
"TFBertForSequenceClassification",
"TFBertForTokenClassification",
"TFBertLMHeadModel",
"TFBertMainLayer",
"TFBertModel",
"TFBertPreTrainedModel",
]
if is_flax_available():
_import_structure["modeling_flax_bert"] = [
"FlaxBertForMaskedLM",
"FlaxBertForMultipleChoice",
"FlaxBertForNextSentencePrediction",
"FlaxBertForPreTraining",
"FlaxBertForQuestionAnswering",
"FlaxBertForSequenceClassification",
"FlaxBertForTokenClassification",
"FlaxBertModel",
"FlaxBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer
if is_tokenizers_available():
from .tokenization_bert_fast import BertTokenizerFast
if is_torch_available():
from .modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
BertForMaskedLM,
BertForMultipleChoice,
BertForNextSentencePrediction,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertLayer,
BertLMHeadModel,
BertModel,
BertPreTrainedModel,
load_tf_weights_in_bert,
)
if is_tf_available():
from .modeling_tf_bert import (
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFBertEmbeddings,
TFBertForMaskedLM,
TFBertForMultipleChoice,
TFBertForNextSentencePrediction,
TFBertForPreTraining,
TFBertForQuestionAnswering,
TFBertForSequenceClassification,
TFBertForTokenClassification,
TFBertLMHeadModel,
TFBertMainLayer,
TFBertModel,
TFBertPreTrainedModel,
)
if is_flax_available():
from .modeling_flax_bert import (
FlaxBertForMaskedLM,
FlaxBertForMultipleChoice,
FlaxBertForNextSentencePrediction,
FlaxBertForPreTraining,
FlaxBertForQuestionAnswering,
FlaxBertForSequenceClassification,
FlaxBertForTokenClassification,
FlaxBertModel,
FlaxBertPreTrainedModel,
)
else:
import importlib
import os
import sys
class _LazyModule(_BaseLazyModule):
"""
Module class that surfaces all objects but only performs associated imports when the objects are requested.
"""
__file__ = globals()["__file__"]
__path__ = [os.path.dirname(__file__)]
def _get_module(self, module_name: str):
return importlib.import_module("." + module_name, self.__name__)
sys.modules[__name__] = _LazyModule(__name__, _import_structure)


@@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
"bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
"bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
"bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
"bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
"bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
"bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
"bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
"bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
"bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
"bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
"bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
"bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
"cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
"wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
# See all BERT models at https://huggingface.co/models?filter=bert
}
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a
:class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
:class:`~transformers.TFBertModel`.
hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
:class:`~transformers.TFBertModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
:obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
:obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
<https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
`Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
<https://arxiv.org/abs/2009.13658>`__.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if ``config.is_decoder=True``.
Examples::
>>> from transformers import BertModel, BertConfig
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "bert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
gradient_checkpointing=False,
position_embedding_type="absolute",
use_cache=True,
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.gradient_checkpointing = gradient_checkpointing
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
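# --- Illustrative usage sketch (not part of the original commit) ---
# Hedged example: this vendored BertConfig behaves like the upstream transformers
# config, so it can be instantiated with custom sizes and round-tripped through a
# plain dict via the inherited PretrainedConfig helpers. Values below are arbitrary.
if __name__ == "__main__":
    config = BertConfig(num_hidden_layers=6, hidden_size=384, num_attention_heads=6)
    config_dict = config.to_dict()
    print(config_dict["num_hidden_layers"], config_dict["hidden_size"])
    restored = BertConfig.from_dict(config_dict)
    print(restored.hidden_size == config.hidden_size)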

File diff suppressed because it is too large.


@@ -0,0 +1,558 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Bert."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt",
"bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
"bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
"bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt",
"bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt",
"bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
"bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt",
"bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
"bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt",
"TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt",
"TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt",
"wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"bert-base-uncased": 512,
"bert-large-uncased": 512,
"bert-base-cased": 512,
"bert-large-cased": 512,
"bert-base-multilingual-uncased": 512,
"bert-base-multilingual-cased": 512,
"bert-base-chinese": 512,
"bert-base-german-cased": 512,
"bert-large-uncased-whole-word-masking": 512,
"bert-large-cased-whole-word-masking": 512,
"bert-large-uncased-whole-word-masking-finetuned-squad": 512,
"bert-large-cased-whole-word-masking-finetuned-squad": 512,
"bert-base-cased-finetuned-mrpc": 512,
"bert-base-german-dbmdz-cased": 512,
"bert-base-german-dbmdz-uncased": 512,
"TurkuNLP/bert-base-finnish-cased-v1": 512,
"TurkuNLP/bert-base-finnish-uncased-v1": 512,
"wietsedv/bert-base-dutch-cased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"bert-base-uncased": {"do_lower_case": True},
"bert-large-uncased": {"do_lower_case": True},
"bert-base-cased": {"do_lower_case": False},
"bert-large-cased": {"do_lower_case": False},
"bert-base-multilingual-uncased": {"do_lower_case": True},
"bert-base-multilingual-cased": {"do_lower_case": False},
"bert-base-chinese": {"do_lower_case": False},
"bert-base-german-cased": {"do_lower_case": False},
"bert-large-uncased-whole-word-masking": {"do_lower_case": True},
"bert-large-cased-whole-word-masking": {"do_lower_case": False},
"bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
"bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
"bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
"bert-base-german-dbmdz-cased": {"do_lower_case": False},
"bert-base-german-dbmdz-uncased": {"do_lower_case": True},
"TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
"TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
"wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BertTokenizer(PreTrainedTokenizer):
r"""
Construct a BERT tokenizer. Based on WordPiece.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to do basic tokenization before WordPiece.
never_split (:obj:`Iterable`, `optional`):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs
):
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# If the token is part of the never_split set
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
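    # Illustrative note (not part of the original file): for a 3-token first sequence and a
    # 2-token second sequence, the mask above is (1 + 3 + 1) zeros for "[CLS] A [SEP]"
    # followed by (2 + 1) ones for "B [SEP]": [0, 0, 0, 0, 0, 1, 1, 1].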
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to lowercase the input when tokenizing.
never_split (:obj:`Iterable`, `optional`):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
def tokenize(self, text, never_split=None):
"""
Basic tokenization of a piece of text. Splits on "white spaces" only; for sub-word tokenization, see
WordpieceTokenizer.
Args:
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
:func:`PreTrainedTokenizer.tokenize`). List of tokens not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
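# Minimal sketch (not part of the original file): the greedy longest-match-first behaviour
# documented above, run against a tiny hypothetical vocabulary instead of a real vocab.txt.
if __name__ == "__main__":
    toy_vocab = {"[UNK]", "un", "##aff", "##able"}
    wp = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
    print(wp.tokenize("unaffable"))  # ['un', '##aff', '##able']
    print(wp.tokenize("xyzzy"))      # no piece matches the toy vocab -> ['[UNK]']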

View File

@ -0,0 +1,140 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module while still preserving other warnings, so don't check this module at all.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...file_utils import (
_BaseLazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig"],
"tokenization_electra": ["ElectraTokenizer"],
}
if is_tokenizers_available():
_import_structure["tokenization_electra_fast"] = ["ElectraTokenizerFast"]
if is_torch_available():
_import_structure["modeling_electra"] = [
"ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST",
"ElectraForMaskedLM",
"ElectraForMultipleChoice",
"ElectraForPreTraining",
"ElectraForQuestionAnswering",
"ElectraForSequenceClassification",
"ElectraForTokenClassification",
"ElectraModel",
"ElectraPreTrainedModel",
"load_tf_weights_in_electra",
]
if is_tf_available():
_import_structure["modeling_tf_electra"] = [
"TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFElectraForMaskedLM",
"TFElectraForMultipleChoice",
"TFElectraForPreTraining",
"TFElectraForQuestionAnswering",
"TFElectraForSequenceClassification",
"TFElectraForTokenClassification",
"TFElectraModel",
"TFElectraPreTrainedModel",
]
if is_flax_available():
_import_structure["modeling_flax_electra"] = [
"FlaxElectraForMaskedLM",
"FlaxElectraForMultipleChoice",
"FlaxElectraForPreTraining",
"FlaxElectraForQuestionAnswering",
"FlaxElectraForSequenceClassification",
"FlaxElectraForTokenClassification",
"FlaxElectraModel",
"FlaxElectraPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
from .tokenization_electra import ElectraTokenizer
if is_tokenizers_available():
from .tokenization_electra_fast import ElectraTokenizerFast
if is_torch_available():
from .modeling_electra import (
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
ElectraForMaskedLM,
ElectraForMultipleChoice,
ElectraForPreTraining,
ElectraForQuestionAnswering,
ElectraForSequenceClassification,
ElectraForTokenClassification,
ElectraModel,
ElectraPreTrainedModel,
load_tf_weights_in_electra,
)
if is_tf_available():
from .modeling_tf_electra import (
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFElectraForMaskedLM,
TFElectraForMultipleChoice,
TFElectraForPreTraining,
TFElectraForQuestionAnswering,
TFElectraForSequenceClassification,
TFElectraForTokenClassification,
TFElectraModel,
TFElectraPreTrainedModel,
)
if is_flax_available():
from .modeling_flax_electra import (
FlaxElectraForMaskedLM,
FlaxElectraForMultipleChoice,
FlaxElectraForPreTraining,
FlaxElectraForQuestionAnswering,
FlaxElectraForSequenceClassification,
FlaxElectraForTokenClassification,
FlaxElectraModel,
FlaxElectraPreTrainedModel,
)
else:
import importlib
import os
import sys
class _LazyModule(_BaseLazyModule):
"""
Module class that surfaces all objects but only performs associated imports when the objects are requested.
"""
__file__ = globals()["__file__"]
__path__ = [os.path.dirname(__file__)]
def _get_module(self, module_name: str):
return importlib.import_module("." + module_name, self.__name__)
sys.modules[__name__] = _LazyModule(__name__, _import_structure)
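# Minimal, self-contained sketch (not part of the original file) of the lazy-import idea used
# above: attribute access looks up the owning submodule in the import-structure dict and only
# then imports it. The class below is illustrative and is not the transformers implementation.
import importlib as _importlib
import types as _types

class _TinyLazyModule(_types.ModuleType):
    def __init__(self, name, import_structure):
        super().__init__(name)
        # invert {submodule: [symbols]} into {symbol: owning submodule}
        self._owner = {sym: mod for mod, syms in import_structure.items() for sym in syms}

    def __getattr__(self, attr):
        # the real import only happens on first attribute access
        module = _importlib.import_module("." + self._owner[attr], self.__name__)
        return getattr(module, attr)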

View File

@ -0,0 +1,166 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ELECTRA model configuration """
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/config.json",
"google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/config.json",
"google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/config.json",
"google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json",
"google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/config.json",
"google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/config.json",
}
class ElectraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel` or a
:class:`~transformers.TFElectraModel`. It is used to instantiate an ELECTRA model according to the specified
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
configuration to that of the ELECTRA `google/electra-small-discriminator
<https://huggingface.co/google/electra-small-discriminator>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the number of different tokens that can be represented by the
:obj:`input_ids` passed when calling :class:`~transformers.ElectraModel` or
:class:`~transformers.TFElectraModel`.
embedding_size (:obj:`int`, `optional`, defaults to 128):
Dimensionality of the token and position embeddings; the model projects them to :obj:`hidden_size` when the two differ.
hidden_size (:obj:`int`, `optional`, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, `optional`, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, `optional`, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ElectraModel` or
:class:`~transformers.TFElectraModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers.
summary_type (:obj:`str`, `optional`, defaults to :obj:`"first"`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Has to be one of the following options:
- :obj:`"last"`: Take the last token hidden state (like XLNet).
- :obj:`"first"`: Take the first token hidden state (like BERT).
- :obj:`"mean"`: Take the mean of all tokens hidden states.
- :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- :obj:`"attn"`: Not implemented now, use multi-head attention.
summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Whether or not to add a projection after the vector extraction.
summary_activation (:obj:`str`, `optional`):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
Pass :obj:`"gelu"` for a gelu activation to the output, any other value will result in no activation.
summary_last_dropout (:obj:`float`, `optional`, defaults to 0.1):
Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
The dropout ratio to be used after the projection and activation.
position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
:obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
:obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
<https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
`Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
<https://arxiv.org/abs/2009.13658>`__.
Examples::
>>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA electra-base-uncased style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model from the electra-base-uncased style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "electra"
def __init__(
self,
vocab_size=30522,
embedding_size=128,
hidden_size=256,
num_hidden_layers=12,
num_attention_heads=4,
intermediate_size=1024,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
summary_type="first",
summary_use_proj=True,
summary_activation="gelu",
summary_last_dropout=0.1,
pad_token_id=0,
position_embedding_type="absolute",
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.position_embedding_type = position_embedding_type

View File

@ -0,0 +1,79 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ELECTRA checkpoint."""
import argparse
import torch
from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
# Initialise PyTorch model
config = ElectraConfig.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
if discriminator_or_generator == "discriminator":
model = ElectraForPreTraining(config)
elif discriminator_or_generator == "generator":
model = ElectraForMaskedLM(config)
else:
raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")
# Load weights from tf checkpoint
load_tf_weights_in_electra(
model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
)
# Save pytorch-model
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The config json file corresponding to the pre-trained model. \n"
"This specifies the model architecture.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--discriminator_or_generator",
default=None,
type=str,
required=True,
help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or "
"'generator'.",
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(
args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator
)
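# Example invocation (illustrative; the script name and paths below are hypothetical):
#   python convert_electra_tf_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path ./electra_small/model.ckpt \
#       --config_file ./electra_small/config.json \
#       --pytorch_dump_path ./electra_small/pytorch_model.bin \
#       --discriminator_or_generator discriminator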

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,66 @@
# coding=utf-8
# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..bert.tokenization_bert import BertTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/vocab.txt",
"google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/vocab.txt",
"google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/vocab.txt",
"google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt",
"google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/vocab.txt",
"google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/electra-small-generator": 512,
"google/electra-base-generator": 512,
"google/electra-large-generator": 512,
"google/electra-small-discriminator": 512,
"google/electra-base-discriminator": 512,
"google/electra-large-discriminator": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"google/electra-small-generator": {"do_lower_case": True},
"google/electra-base-generator": {"do_lower_case": True},
"google/electra-large-generator": {"do_lower_case": True},
"google/electra-small-discriminator": {"do_lower_case": True},
"google/electra-base-discriminator": {"do_lower_case": True},
"google/electra-large-discriminator": {"do_lower_case": True},
}
class ElectraTokenizer(BertTokenizer):
r"""
Construct an ELECTRA tokenizer.
:class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
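# Minimal usage sketch (not part of the original file): build the tokenizer from a tiny
# local vocab so the example is self-contained; the vocabulary below is hypothetical and
# far smaller than the real ELECTRA vocab.txt files referenced above.
if __name__ == "__main__":
    import os
    import tempfile

    toy_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "un", "##aff", "##able"]
    with tempfile.TemporaryDirectory() as tmp:
        vocab_path = os.path.join(tmp, "vocab.txt")
        with open(vocab_path, "w", encoding="utf-8") as f:
            f.write("\n".join(toy_tokens) + "\n")
        tok = ElectraTokenizer(vocab_file=vocab_path, do_lower_case=True)
        print(tok.tokenize("unaffable"))  # ['un', '##aff', '##able']
        print(tok.encode("unaffable"))    # same pieces with [CLS]/[SEP] ids added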

View File

@ -0,0 +1,75 @@
# coding=utf-8
# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..bert.tokenization_bert_fast import BertTokenizerFast
from .tokenization_electra import ElectraTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/vocab.txt",
"google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/vocab.txt",
"google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/vocab.txt",
"google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt",
"google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/vocab.txt",
"google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt",
},
"tokenizer_file": {
"google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/tokenizer.json",
"google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/tokenizer.json",
"google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/tokenizer.json",
"google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/tokenizer.json",
"google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/tokenizer.json",
"google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/electra-small-generator": 512,
"google/electra-base-generator": 512,
"google/electra-large-generator": 512,
"google/electra-small-discriminator": 512,
"google/electra-base-discriminator": 512,
"google/electra-large-discriminator": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"google/electra-small-generator": {"do_lower_case": True},
"google/electra-base-generator": {"do_lower_case": True},
"google/electra-large-generator": {"do_lower_case": True},
"google/electra-small-discriminator": {"do_lower_case": True},
"google/electra-base-discriminator": {"do_lower_case": True},
"google/electra-large-discriminator": {"do_lower_case": True},
}
class ElectraTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = ElectraTokenizer

View File

@ -0,0 +1,100 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module while still preserving other warnings, so don't check this module at all.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from transformers.file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config"],
"tokenization_gpt2": ["GPT2Tokenizer"],
}
if is_tokenizers_available():
_import_structure["tokenization_gpt2_fast"] = ["GPT2TokenizerFast"]
if is_torch_available():
_import_structure["modeling_gpt2"] = [
"GPT2_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPT2DoubleHeadsModel",
"GPT2ForSequenceClassification",
"GPT2LMHeadModel",
"GPT2Model",
"GPT2PreTrainedModel",
"load_tf_weights_in_gpt2",
]
if is_tf_available():
_import_structure["modeling_tf_gpt2"] = [
"TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFGPT2DoubleHeadsModel",
"TFGPT2ForSequenceClassification",
"TFGPT2LMHeadModel",
"TFGPT2MainLayer",
"TFGPT2Model",
"TFGPT2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
from .tokenization_gpt2 import GPT2Tokenizer
if is_tokenizers_available():
from .tokenization_gpt2_fast import GPT2TokenizerFast
if is_torch_available():
from .modeling_gpt2 import (
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
GPT2DoubleHeadsModel,
GPT2ForSequenceClassification,
GPT2LMHeadModel,
GPT2Model,
GPT2PreTrainedModel,
load_tf_weights_in_gpt2,
)
if is_tf_available():
from .modeling_tf_gpt2 import (
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
TFGPT2DoubleHeadsModel,
TFGPT2ForSequenceClassification,
TFGPT2LMHeadModel,
TFGPT2MainLayer,
TFGPT2Model,
TFGPT2PreTrainedModel,
)
else:
import importlib
import os
import sys
class _LazyModule(_BaseLazyModule):
"""
Module class that surfaces all objects but only performs associated imports when the objects are requested.
"""
__file__ = globals()["__file__"]
__path__ = [os.path.dirname(__file__)]
def _get_module(self, module_name: str):
return importlib.import_module("." + module_name, self.__name__)
sys.modules[__name__] = _LazyModule(__name__, _import_structure)

View File

@ -0,0 +1,198 @@
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OpenAI GPT-2 configuration """
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
"gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
"gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
"gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
"distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
}
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a
:class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
to that of the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
:obj:`input_ids` passed when calling :class:`~transformers.GPT2Model` or
:class:`~transformers.TFGPT2Model`.
n_positions (:obj:`int`, `optional`, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, `optional`, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, `optional`, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times :obj:`n_embd`.
activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`):
Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
and :class:`~transformers.TFGPT2DoubleHeadsModel`.
Has to be one of the following options:
- :obj:`"last"`: Take the last token hidden state (like XLNet).
- :obj:`"first"`: Take the first token hidden state (like BERT).
- :obj:`"mean"`: Take the mean of all tokens hidden states.
- :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
- :obj:`"attn"`: Not implemented now, use multi-head attention.
summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
and :class:`~transformers.TFGPT2DoubleHeadsModel`.
Whether or not to add a projection after the vector extraction.
summary_activation (:obj:`str`, `optional`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
and :class:`~transformers.TFGPT2DoubleHeadsModel`.
Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
and :class:`~transformers.TFGPT2DoubleHeadsModel`.
The dropout ratio to be used after the projection and activation.
scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
Scale attention weights by dividing by sqrt(hidden_size).
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "gpt2"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_ctx=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_new",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
scale_attn_weights=True,
gradient_checkpointing=False,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
**kwargs
):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
self.gradient_checkpointing = gradient_checkpointing
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
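# Minimal sketch (not part of the original file): the config keeps GPT-2's historical field
# names (n_embd, n_layer, n_head) and exposes the generic names through the properties above.
if __name__ == "__main__":
    cfg = GPT2Config(n_embd=768, n_layer=12, n_head=12)
    assert cfg.hidden_size == cfg.n_embd            # 768
    assert cfg.num_hidden_layers == cfg.n_layer     # 12
    assert cfg.num_attention_heads == cfg.n_head    # 12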

View File

@ -0,0 +1,68 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""
import argparse
import torch
from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2
from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
from transformers.utils import logging
logging.set_verbosity_info()
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
# Construct model
if gpt2_config_file == "":
config = GPT2Config()
else:
config = GPT2Config.from_json_file(gpt2_config_file)
model = GPT2Model(config)
# Load weights from numpy
load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
# Save pytorch-model
pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
print(f"Save PyTorch model to {pytorch_weights_dump_path}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {pytorch_config_dump_path}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--gpt2_config_file",
default="",
type=str,
help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
"This specifies the model architecture.",
)
args = parser.parse_args()
convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
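# Example invocation (illustrative; the script name and paths below are hypothetical):
#   python convert_gpt2_tf_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path ./gpt2/model.ckpt \
#       --gpt2_config_file ./gpt2/config.json \
#       --pytorch_dump_folder_path ./gpt2-pytorch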

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,309 @@
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
import json
import os
from functools import lru_cache
from typing import TYPE_CHECKING, List, Optional, Tuple
import regex as re
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
if TYPE_CHECKING:
from transformers.pipelines.conversational import Conversation
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
"gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
"gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
"gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
"distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
},
"merges_file": {
"gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
"gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
"gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
"gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
"distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt2": 1024,
"gpt2-medium": 1024,
"gpt2-large": 1024,
"gpt2-xl": 1024,
"distilgpt2": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
Returns a mapping from utf-8 bytes to unicode strings. We specifically avoid mapping to whitespace/control
characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
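# Illustrative note (not part of the original file): bytes_to_unicode() maps every byte
# 0-255 to a printable unicode character, so arbitrary UTF-8 byte sequences become plain
# strings the BPE merges can operate on. For example, printable bytes map to themselves
# (ord("A") -> "A") while the space byte 0x20 maps to "\u0120" ("Ġ"), which is why GPT-2
# tokens that start a new word show a leading "Ġ".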
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class GPT2Tokenizer(PreTrainedTokenizer):
"""
Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
::
>>> from transformers import GPT2Tokenizer
>>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
one).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if is_split_into_words or add_prefix_space:
text = " " + text
return (text, kwargs)
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
input_ids = []
for is_user, text in conversation.iter_texts():
input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
if len(input_ids) > self.model_max_length:
input_ids = input_ids[-self.model_max_length :]
return input_ids
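# Minimal, self-contained sketch (not part of the original file): a toy vocab/merges pair,
# just to exercise the byte-level BPE plumbing end to end. The vocabulary and merge rules
# below are hypothetical and far smaller than the real GPT-2 assets listed above.
if __name__ == "__main__":
    import tempfile

    toy_vocab = {"l": 0, "o": 1, "w": 2, "lo": 3, "low": 4, "<|endoftext|>": 5}
    toy_merges = "#version: 0.2\nl o\nlo w\n"
    with tempfile.TemporaryDirectory() as tmp:
        vocab_path = os.path.join(tmp, "vocab.json")
        merges_path = os.path.join(tmp, "merges.txt")
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(toy_vocab, f)
        with open(merges_path, "w", encoding="utf-8") as f:
            f.write(toy_merges)
        tok = GPT2Tokenizer(vocab_path, merges_path)
        print(tok.tokenize("low"))                  # ['low'] after the merges l+o, lo+w
        print(tok.convert_tokens_to_ids(["low"]))   # [4]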

View File

@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
import json
from typing import TYPE_CHECKING, List, Optional, Tuple
from tokenizers import pre_tokenizers
from transformers.tokenization_utils_base import BatchEncoding
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import logging
from .tokenization_gpt2 import GPT2Tokenizer
if TYPE_CHECKING:
from transformers.pipelines.conversational import Conversation
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
"gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
"gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
"gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
"distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
},
"merges_file": {
"gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
"gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
"gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
"gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
"distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
},
"tokenizer_file": {
"gpt2": "https://huggingface.co/gpt2/resolve/main/tokenizer.json",
"gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/tokenizer.json",
"gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/tokenizer.json",
"gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/tokenizer.json",
"distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt2": 1024,
"gpt2-medium": 1024,
"gpt2-large": 1024,
"gpt2-xl": 1024,
"distilgpt2": 1024,
}
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
::
>>> from transformers import GPT2TokenizerFast
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
vocab_file,
merges_file,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
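        # The pre-tokenizer restored from ``tokenizer_file`` may have been serialized with a different
        # ``add_prefix_space`` value; the block below rebuilds it so that it honours the value passed
        # to this constructor.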
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
"""This corresponds to DialoGPT variants of models."""
input_ids = []
for is_user, text in conversation.iter_texts():
input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
if len(input_ids) > self.model_max_length:
input_ids = input_ids[-self.model_max_length :]
return input_ids
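# Illustrative sketch, not part of the upstream file: how ``_build_conversation_input_ids`` flattens
# DialoGPT-style turns into one id sequence, terminating every turn with ``eos_token_id`` and keeping
# only the last ``model_max_length`` tokens. The turn texts below are made-up examples.
if __name__ == "__main__":
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    turns = ["Does money buy happiness?", "It depends on how you spend it."]
    input_ids = []
    for text in turns:
        # each turn is encoded without special tokens and closed with the end-of-text id
        input_ids.extend(tokenizer.encode(text, add_special_tokens=False) + [tokenizer.eos_token_id])
    input_ids = input_ids[-tokenizer.model_max_length:]  # left-truncate to the model's context size
    print(len(input_ids), input_ids)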

View File

@ -0,0 +1,134 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module while still preserving other warnings, so don't check this module at all.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from transformers.file_utils import (
_BaseLazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig"],
"tokenization_roberta": ["RobertaTokenizer"],
}
if is_tokenizers_available():
_import_structure["tokenization_roberta_fast"] = ["RobertaTokenizerFast"]
if is_torch_available():
_import_structure["modeling_roberta"] = [
"ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
"RobertaForCausalLM",
"RobertaForMaskedLM",
"RobertaForMultipleChoice",
"RobertaForQuestionAnswering",
"RobertaForSequenceClassification",
"RobertaForTokenClassification",
"RobertaModel",
]
if is_tf_available():
_import_structure["modeling_tf_roberta"] = [
"TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFRobertaForMaskedLM",
"TFRobertaForMultipleChoice",
"TFRobertaForQuestionAnswering",
"TFRobertaForSequenceClassification",
"TFRobertaForTokenClassification",
"TFRobertaMainLayer",
"TFRobertaModel",
"TFRobertaPreTrainedModel",
]
if is_flax_available():
_import_structure["modeling_flax_roberta"] = [
"FlaxRobertaForMaskedLM",
"FlaxRobertaForMultipleChoice",
"FlaxRobertaForQuestionAnswering",
"FlaxRobertaForSequenceClassification",
"FlaxRobertaForTokenClassification",
"FlaxRobertaModel",
"FlaxRobertaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
from .tokenization_roberta import RobertaTokenizer
if is_tokenizers_available():
from .tokenization_roberta_fast import RobertaTokenizerFast
if is_torch_available():
from .modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
RobertaForCausalLM,
RobertaForMaskedLM,
RobertaForMultipleChoice,
RobertaForQuestionAnswering,
RobertaForSequenceClassification,
RobertaForTokenClassification,
RobertaModel,
)
if is_tf_available():
from .modeling_tf_roberta import (
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
TFRobertaForQuestionAnswering,
TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
TFRobertaMainLayer,
TFRobertaModel,
TFRobertaPreTrainedModel,
)
if is_flax_available():
from .modeling_flax_roberta import (
FlaxRobertaForMaskedLM,
FlaxRobertaForMultipleChoice,
FlaxRobertaForQuestionAnswering,
FlaxRobertaForSequenceClassification,
FlaxRobertaForTokenClassification,
FlaxRobertaModel,
FlaxRobertaPreTrainedModel,
)
else:
import importlib
import os
import sys
class _LazyModule(_BaseLazyModule):
"""
Module class that surfaces all objects but only performs associated imports when the objects are requested.
"""
__file__ = globals()["__file__"]
__path__ = [os.path.dirname(__file__)]
def _get_module(self, module_name: str):
return importlib.import_module("." + module_name, self.__name__)
sys.modules[__name__] = _LazyModule(__name__, _import_structure)
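# Note on the lazy pattern above: replacing the module in ``sys.modules`` keeps importing this package
# cheap; a name such as ``RobertaModel`` is only resolved when it is first accessed, at which point
# ``_get_module`` imports the corresponding submodule (and, with it, torch/tf/flax) on demand.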

View File

@ -0,0 +1,64 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" RoBERTa configuration """
from transformers.utils import logging
from ..bert.configuration_bert import BertConfig
logger = logging.get_logger(__name__)
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json",
}
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a
:class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified
arguments, defining the model architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
same defaults. Please check the parent class for more information.
Examples::
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "roberta"
def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
"""Constructs RobertaConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
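# Minimal usage sketch, not part of the upstream file: RobertaConfig keeps every BertConfig default but
# pins the RoBERTa special-token ids set in the constructor above.
if __name__ == "__main__":
    config = RobertaConfig()
    assert (config.pad_token_id, config.bos_token_id, config.eos_token_id) == (1, 0, 2)
    # any BertConfig keyword can still be overridden, e.g. a deliberately tiny model for testing
    tiny = RobertaConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=2)
    print(tiny.hidden_size, tiny.num_hidden_layers, tiny.num_attention_heads)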

File diff suppressed because it is too large

View File

@ -0,0 +1,253 @@
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
from typing import List, Optional
from transformers.tokenization_utils import AddedToken
from transformers.utils import logging
from ..gpt2.tokenization_gpt2 import GPT2Tokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"roberta-base": 512,
"roberta-large": 512,
"roberta-large-mnli": 512,
"distilroberta-base": 512,
"roberta-base-openai-detector": 512,
"roberta-large-openai-detector": 512,
}
class RobertaTokenizer(GPT2Tokenizer):
"""
Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a preceding space) or not:
::
>>> from transformers import RobertaTokenizer
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
>>> tokenizer("Hello world")['input_ids']
[0, 31414, 232, 2]
>>> tokenizer(" Hello world")['input_ids']
[0, 20920, 232, 2]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
one).
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end of
sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows the leading word to be treated like
any other word (the RoBERTa tokenizer detects the beginning of a word by the preceding space).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
**kwargs
):
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behaves like a normal word, i.e. it includes the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. A RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
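    # Illustrative shapes (the token ids other than cls/sep are hypothetical placeholders; with the usual
    # roberta-base vocabulary ``cls_token_id`` is 0 and ``sep_token_id`` is 2):
    #   build_inputs_with_special_tokens([713, 16])         -> [0, 713, 16, 2]
    #   build_inputs_with_special_tokens([713, 16], [100])  -> [0, 713, 16, 2, 2, 100, 2]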
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)