Add files via upload

TimelordRi 2021-09-29 22:46:35 +08:00 committed by GitHub
parent 4c0b4558fb
commit 403c7e1904
2 changed files with 155 additions and 0 deletions


@@ -0,0 +1,11 @@
numpy==1.20.3
tokenizers==0.10.3
torch==1.8.0
regex==2021.4.4
transformers==4.7.0
tqdm==4.49.0
activations==0.1.0
dataclasses==0.6
file_utils==0.0.1
flax==0.3.4
utils==1.0.1

example/re/few-shot/run.py (new file, 144 lines)

@@ -0,0 +1,144 @@
from logging import debug
import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader
import yaml
import time
import hydra
from lit_models import TransformerLitModelTwoSteps
from transformers import AutoConfig, AutoModel
from transformers.optimization import get_linear_schedule_with_warmup
import os
from tqdm import tqdm
from deepke.src.relation_extraction.few_shot import *

os.environ["TOKENIZERS_PARALLELISM"] = "false"
# In order to ensure reproducible experiments, we must set random seeds.


def logging(log_dir, s, print_=True, log_=True):
    # Print to stdout and/or append to the log file given by log_dir.
    if print_:
        print(s)
    if log_dir != '' and log_:
        with open(log_dir, 'a+') as f_log:
            f_log.write(s + '\n')


@hydra.main(config_path="conf/config.yaml")
def main(cfg):
    get_label_word()
    generate_k_shot()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = BertForMaskedLM.from_pretrained(cfg.model_name_or_path, config=config)

    if cfg.train_from_saved_model != '':
        model.load_state_dict(torch.load(cfg.train_from_saved_model)["checkpoint"])
        print("load saved model from {}.".format(cfg.train_from_saved_model))

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
    model.to(device)

    cur_model = model.module if hasattr(model, 'module') else model

    if "gpt" in cfg.model_name_or_path or "roberta" in cfg.model_name_or_path:
        tokenizer = data.get_tokenizer()
        cur_model.resize_token_embeddings(len(tokenizer))
        cur_model.update_word_idx(len(tokenizer))
        if "Use" in cfg.model_class:
            # Look up the ids of the learnable prompt tokens ([T1], [T2]), the
            # label words ([class1]..[classN]) and the discrete prompt words
            # "It was", then hand them to init_unused_weights.
            continous_prompt = [a[0] for a in tokenizer([f"[T{i}]" for i in range(1, 3)], add_special_tokens=False)['input_ids']]
            continous_label_word = [a[0] for a in tokenizer([f"[class{i}]" for i in range(1, data.num_labels + 1)], add_special_tokens=False)['input_ids']]
            discrete_prompt = [a[0] for a in tokenizer(['It', 'was'], add_special_tokens=False)['input_ids']]
            dataset_name = cfg.data_dir.split("/")[1]
            model.init_unused_weights(continous_prompt, continous_label_word, discrete_prompt, label_path=f"{cfg.model_name_or_path}_{dataset_name}.pt")

    lit_model = BertLitModel(cfg=cfg, model=model, tokenizer=data.tokenizer, device=device)
    if cfg.train_from_saved_model != '':
        lit_model.best_f1 = torch.load(cfg.train_from_saved_model)["best_f1"]
    data.tokenizer.save_pretrained('test')
    data.setup()

    optimizer = lit_model.configure_optimizers()
    if cfg.train_from_saved_model != '':
        optimizer.load_state_dict(torch.load(cfg.train_from_saved_model)["optimizer"])
        print("load saved optimizer from {}.".format(cfg.train_from_saved_model))

    num_training_steps = len(data.train_dataloader()) // cfg.gradient_accumulation_steps * cfg.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_training_steps * 0.1, num_training_steps=num_training_steps)
    log_step = 100

    logging(cfg.log_dir, '-' * 89, print_=False)
    logging(cfg.log_dir, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' INFO : START TO TRAIN ', print_=False)
    logging(cfg.log_dir, '-' * 89, print_=False)

    for epoch in range(cfg.num_train_epochs):
        model.train()
        num_batch = len(data.train_dataloader())
        total_loss = 0
        log_loss = 0
        for index, train_batch in enumerate(tqdm(data.train_dataloader())):
            loss = lit_model.training_step(train_batch, index)
            total_loss += loss.item()
            log_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if log_step > 0 and (index + 1) % log_step == 0:
                cur_loss = log_loss / log_step
                logging(cfg.log_dir,
                        '| epoch {:2d} | step {:4d} | lr {} | train loss {:5.3f}'.format(
                            epoch, (index + 1), scheduler.get_last_lr(), cur_loss * 1000),
                        print_=False)
                log_loss = 0
        avrg_loss = total_loss / num_batch
        logging(cfg.log_dir,
                '| epoch {:2d} | train loss {:5.3f}'.format(epoch, avrg_loss * 1000))

        # Evaluate on the dev set after every epoch and keep the best checkpoint.
        model.eval()
        with torch.no_grad():
            val_loss = []
            for val_index, val_batch in enumerate(tqdm(data.val_dataloader())):
                loss = lit_model.validation_step(val_batch, val_index)
                val_loss.append(loss)
            f1, best, best_f1 = lit_model.validation_epoch_end(val_loss)
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir,
                    '| epoch {:2d} | dev_result: {}'.format(epoch, f1))
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir,
                    '| best_f1: {}'.format(best_f1))
            logging(cfg.log_dir, '-' * 89)
            if cfg.save_path != "" and best != -1:
                save_path = cfg.save_path
                torch.save({
                    'epoch': epoch,
                    'checkpoint': cur_model.state_dict(),
                    'best_f1': best_f1,
                    'optimizer': optimizer.state_dict()
                }, save_path,
                    _use_new_zipfile_serialization=False)
                logging(cfg.log_dir,
                        '| successfully save model at: {}'.format(save_path))
                logging(cfg.log_dir, '-' * 89)


if __name__ == "__main__":
    main()
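
The script is launched through Hydra, so it expects a conf/config.yaml next to run.py. Below is a minimal sketch of that file covering only the keys the script reads; the values are illustrative placeholders and assumptions, not the defaults shipped with DeepKE.

# conf/config.yaml (sketch: every key below is read by run.py, all values are examples)
model_name_or_path: bert-base-uncased    # HuggingFace model id or local path
model_class: <model class name>          # the prompt-initialisation branch runs only when this string contains "Use"
data_dir: dataset/semeval                # the second path component is used as the dataset name
train_from_saved_model: ""               # checkpoint produced by this script, or "" to train from scratch
save_path: checkpoints/best.pt           # where the best checkpoint is written; "" disables saving
log_dir: train.log                       # file appended to by the logging() helper; "" disables file logging
num_train_epochs: 30
gradient_accumulation_steps: 1

Since Hydra is in charge of the config, any of these keys can also be overridden on the command line, for example: python run.py num_train_epochs=10 save_path=output/best.pt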