Add files via upload

TimelordRi 2021-10-10 19:18:44 +08:00 committed by GitHub
parent 1f0dbb0a4f
commit 4024d91226
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 11110 additions and 0 deletions


@@ -0,0 +1,52 @@
## Quick Start
### Requirements
> python == 3.8
- torch == 1.5
- transformers == 3.4.0
- hydra-core == 1.0.6
- deepke
### Clone the repository
```
git clone git@github.com:zjunlp/DeepKE.git
```
### Install with pip
First create a Python virtual environment, then activate it.
- Install the dependencies: ```pip install -r requirements.txt```
### Train and predict with your own data
- Data: place the training data in the `data` folder. The model uses the [SEMEVAL](https://semeval2.fbk.eu/semeval2.php?location=tasks#T11) dataset, which comes from Task 8 of SemEval-2010, "Multi-Way Classification of Semantic Relations Between Pairs of Nominals".
- SEMEVAL contains the following files:
  - `rel2id.json`: mapping from relation labels to IDs
  - `temp.txt`: relation label templates
  - `test.txt`: test set
  - `train.txt`: training set
  - `val.txt`: validation set
- Training: where the model is loaded from and saved to, as well as other settings, can be changed in the `.yaml` files under `conf`.
  - Few-shot training on SEMEVAL: `python run.py`
  - The trained model is saved in the root directory by default.
  - To resume training from a previously saved model, set `train_from_saved_model` in the `.yaml` file to the path of that model (the layout of the saved checkpoint is sketched below).
  - Training logs are saved in the root directory by default; the path can be changed via `log_dir` in the `.yaml` file.
- Prediction: `python predict.py`
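
The checkpoint written by `run.py` and read back by `predict.py` is a plain `torch.save` dictionary. A minimal sketch of inspecting it, assuming the default `./model_bert.pt` path from `conf/train.yaml`:

```
import torch

# Inspect the checkpoint saved by run.py; the keys mirror its torch.save call.
ckpt = torch.load("./model_bert.pt", map_location="cpu")
print(list(ckpt.keys()))               # ['epoch', 'checkpoint', 'best_f1', 'optimizer']
print(ckpt["epoch"], ckpt["best_f1"])  # last saved epoch and best dev F1

# To resume training, set `train_from_saved_model` in conf/train.yaml to this file;
# run.py then restores the model weights, optimizer state, and best F1 from it.
```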
## Model
KnowPrompt


@@ -0,0 +1,3 @@
defaults:
- hydra/output: custom
- train


@@ -0,0 +1,11 @@
hydra:
  run:
    # Output directory for normal runs
    dir: logs/${now:%Y-%m-%d_%H-%M-%S}
  sweep:
    # Output directory for sweep runs
    dir: logs/${now:%Y-%m-%d_%H-%M-%S}
    # Output sub directory for sweep runs.
    subdir: ${hydra.job.num}_${hydra.job.id}
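
With these settings Hydra creates a fresh `logs/<timestamp>` output directory and switches the working directory into it for every run, which is why `run.py` and `predict.py` call `get_original_cwd()` and change back to the project root. A minimal sketch illustrating the behaviour (assuming hydra-core 1.0.x as pinned in `requirements.txt`; the function name is illustrative):

```
import os
import hydra
from hydra.utils import get_original_cwd

@hydra.main(config_path="conf/config.yaml")  # same entry-point style as run.py
def where_am_i(cfg):
    print("Hydra run dir:", os.getcwd())         # e.g. logs/2021-10-10_19-18-44
    print("Project root :", get_original_cwd())  # directory the script was launched from

if __name__ == "__main__":
    where_am_i()
```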


@@ -0,0 +1,83 @@
accelerator: None
accumulate_grad_batches: '1'
amp_backend: 'native'
amp_level: 'O2'
auto_lr_find: False
auto_scale_batch_size: False
auto_select_gpus: False
batch_size: 16
benchmark: False
check_val_every_n_epoch: '3'
checkpoint_callback: True
data_class: 'REDataset'
data_dir: 'data/k-shot/8-1'
default_root_dir: None
deterministic: False
devices: None
distributed_backend: None
fast_dev_run: False
flush_logs_every_n_steps: 100
gpus: None
gradient_accumulation_steps: 1
gradient_clip_algorithm: 'norm'
gradient_clip_val: 0.0
ipus: None
limit_predict_batches: 1.0
limit_test_batches: 1.0
limit_train_batches: 1.0
limit_val_batches: 1.0
litmodel_class: 'BertLitModel'
load_checkpoint: None
log_dir: './model_bert.log'
log_every_n_steps: 50
log_gpu_memory: None
logger: True
lr: 3e-05
lr_2: 3e-05
max_epochs: '30'
max_seq_length: 256
max_steps: None
max_time: None
min_epochs: None
min_steps: None
model_class: 'BertForMaskedLM'
model_name_or_path: 'bert-large-uncased'
move_metrics_to_cpu: False
multiple_trainloader_mode: 'max_size_cycle'
num_nodes: 1
num_processes: 1
num_sanity_val_steps: 2
num_train_epochs: 30
num_workers: 8
optimizer: 'AdamW'
overfit_batches: 0.0
plugins: None
precision: 32
prepare_data_per_node: True
process_position: 0
profiler: None
progress_bar_refresh_rate: None
ptune_k: 7
reload_dataloaders_every_epoch: False
reload_dataloaders_every_n_epochs: 0
replace_sampler_ddp: True
resume_from_checkpoint: None
save_path: './model_bert.pt'
seed: 666
stochastic_weight_avg: False
sync_batchnorm: False
t_lambda: 0.001
task_name: 'wiki80'
terminate_on_nan: False
tpu_cores: None
track_grad_norm: -1
train_from_saved_model: ''
truncated_bptt_steps: None
two_steps: False
use_prompt: True
val_check_interval: 1.0
wandb: False
weight_decay: 0.01
weights_save_path: None
weights_summary: 'top'
load_path: './model_bert.pt'
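
These flat keys are what `run.py` and `predict.py` receive through Hydra as attributes of `cfg`. A minimal sketch of inspecting the composed configuration (the function name is illustrative, not part of this commit):

```
import hydra
from omegaconf import OmegaConf

@hydra.main(config_path="conf/config.yaml")
def show_config(cfg):
    # Every key in train.yaml becomes an attribute of the composed DictConfig.
    print(cfg.model_name_or_path)  # 'bert-large-uncased'
    print(cfg.batch_size, cfg.lr)  # 16 3e-05
    print(OmegaConf.to_yaml(cfg))  # dump the fully composed configuration

if __name__ == "__main__":
    show_config()
```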


@@ -0,0 +1 @@
{"Component-Whole(e2,e1)": 1, "Other": 0, "Instrument-Agency(e2,e1)": 2, "Member-Collection(e1,e2)": 3, "Cause-Effect(e2,e1)": 4, "Entity-Destination(e1,e2)": 5, "Content-Container(e1,e2)": 6, "Message-Topic(e1,e2)": 7, "Product-Producer(e2,e1)": 8, "Member-Collection(e2,e1)": 9, "Entity-Origin(e1,e2)": 10, "Cause-Effect(e1,e2)": 11, "Component-Whole(e1,e2)": 12, "Message-Topic(e2,e1)": 13, "Product-Producer(e1,e2)": 14, "Entity-Origin(e2,e1)": 15, "Content-Container(e2,e1)": 16, "Instrument-Agency(e1,e2)": 17, "Entity-Destination(e2,e1)": 18}


@@ -0,0 +1,19 @@
0 Other nothing has nothing to nothing
0 Member-Collection(e1,e2) member member of collection collection
0 Entity-Origin(e1,e2) entity entity of origin origin
0 Cause-Effect(e1,e2) cause cause of effect effect
0 Component-Whole(e1,e2) component component of whole whole
0 Product-Producer(e1,e2) product product of producer producer
0 Instrument-Agency(e1,e2) instrument instrument of agency agency
0 Entity-Destination(e1,e2) entity entity of destination destination
0 Content-Container(e1,e2) content content of container container
0 Message-Topic(e1,e2) message message of topic topic
2 Cause-Effect(e2,e1) effect effect of cause cause
2 Product-Producer(e2,e1) producer producer of product product
2 Component-Whole(e2,e1) whole whole of component component
2 Instrument-Agency(e2,e1) agency agency of instrument instrument
2 Member-Collection(e2,e1) collection collection of member member
2 Message-Topic(e2,e1) topic topic of message message
2 Entity-Origin(e2,e1) origin origin of entity entity
2 Content-Container(e2,e1) container container of content content
2 Entity-Destination(e2,e1) destination destination of entity entity
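
Each line above pairs a relation label with a few descriptive words. How these fields are used inside `deepkerefew` (for example, to build prompt label words) is not shown in this diff, so the following is only a hedged parsing sketch with an assumed file path:

```
# Read temp.txt; each line looks like "<flag> <relation label> <descriptive words ...>".
with open("data/temp.txt", encoding="utf-8") as f:
    for line in f:
        flag, relation, *words = line.split()
        print(flag, relation, words)
```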

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,83 @@
from logging import debug
import hydra
from hydra.utils import get_original_cwd
import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader
import yaml
import time
from transformers import AutoConfig, AutoModelForMaskedLM
from transformers.optimization import get_linear_schedule_with_warmup
import os
from tqdm import tqdm
from deepkerefew import *

os.environ["TOKENIZERS_PARALLELISM"] = "false"


# In order to ensure reproducible experiments, we must set random seeds.
def logging(log_dir, s, print_=True, log_=True):
    # Print to stdout and/or append to the log file configured via cfg.log_dir.
    if print_:
        print(s)
    if log_dir != '' and log_:
        with open(log_dir, 'a+') as f_log:
            f_log.write(s + '\n')


def test(args, model, lit_model, data):
    # Evaluate the trained model on the test set and log the resulting F1.
    model.eval()
    with torch.no_grad():
        test_loss = []
        for test_index, test_batch in enumerate(tqdm(data.test_dataloader())):
            loss = lit_model.test_step(test_batch, test_index)
            test_loss.append(loss)
        f1 = lit_model.test_epoch_end(test_loss)
        logging(args.log_dir,
                '| test_result: {}'.format(f1))
        logging(args.log_dir, '-' * 89)


@hydra.main(config_path="conf/config.yaml")
def main(cfg):
    # Hydra switches to its own run directory, so move back to the project root.
    cwd = get_original_cwd()
    os.chdir(cwd)
    if not os.path.exists(f"data/{cfg.model_name_or_path}.pt"):
        get_label_word(cfg)
    if not os.path.exists(cfg.data_dir):
        generate_k_shot(cfg.data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = AutoModelForMaskedLM.from_pretrained(cfg.model_name_or_path, config=config)
    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
    model.to(device)

    lit_model = BertLitModel(args=cfg, model=model, tokenizer=data.tokenizer)
    data.setup()

    # Load the checkpoint saved by run.py (the second positional argument is `strict`).
    model.load_state_dict(torch.load(cfg.load_path)["checkpoint"], False)
    print("load trained model from {}.".format(cfg.load_path))

    test(cfg, model, lit_model, data)


if __name__ == "__main__":
    main()


@@ -0,0 +1,3 @@
torch==1.5
transformers==3.4.0
hydra-core==1.0.6

example/re/few-shot/run.py (new file, 138 lines)

@@ -0,0 +1,138 @@
from logging import debug
import hydra
from hydra.utils import get_original_cwd
import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader
import yaml
import time
from transformers import AutoConfig, AutoModelForMaskedLM
from transformers.optimization import get_linear_schedule_with_warmup
import os
from tqdm import tqdm
from deepkerefew import *

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def logging(log_dir, s, print_=True, log_=True):
    # Print to stdout and/or append to the log file configured via cfg.log_dir.
    if print_:
        print(s)
    if log_dir != '' and log_:
        with open(log_dir, 'a+') as f_log:
            f_log.write(s + '\n')


@hydra.main(config_path="conf/config.yaml")
def main(cfg):
    # Hydra switches to its own run directory, so move back to the project root.
    cwd = get_original_cwd()
    os.chdir(cwd)
    if not os.path.exists(f"data/{cfg.model_name_or_path}.pt"):
        get_label_word(cfg)
    if not os.path.exists(cfg.data_dir):
        generate_k_shot(cfg.data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = AutoModelForMaskedLM.from_pretrained(cfg.model_name_or_path, config=config)
    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
    model.to(device)

    lit_model = BertLitModel(args=cfg, model=model, tokenizer=data.tokenizer)
    data.setup()

    # Optionally resume from a previously saved checkpoint.
    if cfg.train_from_saved_model != '':
        model.load_state_dict(torch.load(cfg.train_from_saved_model)["checkpoint"])
        print("load saved model from {}.".format(cfg.train_from_saved_model))
        lit_model.best_f1 = torch.load(cfg.train_from_saved_model)["best_f1"]
    # data.tokenizer.save_pretrained('test')

    optimizer = lit_model.configure_optimizers()
    if cfg.train_from_saved_model != '':
        optimizer.load_state_dict(torch.load(cfg.train_from_saved_model)["optimizer"])
        print("load saved optimizer from {}.".format(cfg.train_from_saved_model))

    num_training_steps = len(data.train_dataloader()) // cfg.gradient_accumulation_steps * cfg.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_training_steps * 0.1, num_training_steps=num_training_steps)
    log_step = 100

    logging(cfg.log_dir, '-' * 89, print_=False)
    logging(cfg.log_dir, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' INFO : START TO TRAIN ', print_=False)
    logging(cfg.log_dir, '-' * 89, print_=False)

    for epoch in range(cfg.num_train_epochs):
        model.train()
        num_batch = len(data.train_dataloader())
        total_loss = 0
        log_loss = 0
        for index, train_batch in enumerate(tqdm(data.train_dataloader())):
            loss = lit_model.training_step(train_batch, index)
            total_loss += loss.item()
            log_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if log_step > 0 and (index + 1) % log_step == 0:
                cur_loss = log_loss / log_step
                logging(cfg.log_dir,
                        '| epoch {:2d} | step {:4d} | lr {} | train loss {:5.3f}'.format(
                            epoch, (index + 1), scheduler.get_last_lr(), cur_loss * 1000),
                        print_=False)
                log_loss = 0

        avrg_loss = total_loss / num_batch
        logging(cfg.log_dir,
                '| epoch {:2d} | train loss {:5.3f}'.format(
                    epoch, avrg_loss * 1000))

        # Validate after every epoch and save the checkpoint when the dev F1 improves.
        model.eval()
        with torch.no_grad():
            val_loss = []
            for val_index, val_batch in enumerate(tqdm(data.val_dataloader())):
                loss = lit_model.validation_step(val_batch, val_index)
                val_loss.append(loss)
            f1, best, best_f1 = lit_model.validation_epoch_end(val_loss)
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir,
                    '| epoch {:2d} | dev_result: {}'.format(epoch, f1))
            logging(cfg.log_dir, '-' * 89)
            logging(cfg.log_dir,
                    '| best_f1: {}'.format(best_f1))
            logging(cfg.log_dir, '-' * 89)
            if cfg.save_path != "" and best != -1:
                save_path = cfg.save_path
                torch.save({
                    'epoch': epoch,
                    'checkpoint': model.state_dict(),
                    'best_f1': best_f1,
                    'optimizer': optimizer.state_dict()
                }, save_path,
                    _use_new_zipfile_serialization=False)
                logging(cfg.log_dir,
                        '| successfully save model at: {}'.format(save_path))
                logging(cfg.log_dir, '-' * 89)


if __name__ == "__main__":
    main()