commit 14f85eacd67c46c076f9041420675dd0a5be18b2 Author: leo Date: Tue Aug 20 21:25:34 2019 +0800 init diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..fe13d48 --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,13 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free experience for everyone, regardless of the level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, age, or religion. + +Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. 
+ +This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.0.0, available at [http://contributor-covenant.org/version/1/0/0/](http://contributor-covenant.org/version/1/0/0/) \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..1b39691 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,13 @@ + + + +**What kind of change does this PR introduce?** (check at least one) + +- [ ] Bugfix +- [ ] Feature +- [ ] Code style update +- [ ] Refactor +- [ ] Build-related changes +- [ ] Other, please describe: + +**Other information:**w \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f8fec36 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +.DS_Store + +.idea +.vscode + +__pycache__ +*.pyc + + +checkpoints + +demo.py +predict.py + +pytorch_transformers +bert_parameters diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f49a4e1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5e15ec5 --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# Deepke + +deepke 是基于 Pytorch 的中文关系抽取处理套件。 + +## 环境依赖: + +- python >= 3.6 +- torch >=1.0 +- jieba >= 0.39 +- scikit_learn >= 0.21 +- pytorch_transformers>=1.0 + + +## 主要目录 + +``` +├── checkpoints # 保存训练后的模型参数 +├── data # 数据目录 +│ ├── origin # 训练使用的原始数据集 +│ ├── train.csv # 训练数据集 +│ ├── test.csv # 测试数据集 +│ ├── relation.txt # 关系种类 +├── model # 模型目录 +│ ├── __init__.py +│ ├── BasicModule.py # 模型基本配置 +│ ├── Embedding.py # Embeddding 模块 +│ ├── CNN.py # CNN & PCNN 模型 +│ ├── BiLSTM.py # BiLSTM 模型 +│ ├── Transformer.py # Transformer 模型 +│ ├── Capsule.py # Capsule 模型 +│ ├── Bert.py # 语言预训练 模型 +├── src +│ ├── config.py # 配置文件 +│ ├── vocab.py # 词汇表构建函数 +│ ├── process.py # 训练前预处理数据 +│ ├── dataset.py # 训练时批处理输入数据 +│ ├── trainer.py # 训练迭代函数 +│ ├── utils.py # 工具函数 +├── main.py # 主入口文件 +├── README.md # read me 文件 +``` + +## 快速开始 + +数据为 csv 文件,样式范例为: + + +sentence|relation|head|head_type|head_offset|tail|tail_type|tail_offset +:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---: +谢万松,字树人,湖北省武汉市人,武汉钢铁集团公司联合焦化公司退体职工,生于1940年|出生地|谢万松|人物|0|湖北省武汉市|地点|8 +《娘家的故事第二部》是张玲执导,林在培、何赛飞等主演的电视剧|导演|娘家的故事第二部|影视作品|1|张玲|人物|11 +九玄珠是在纵横中文网连载的一部小说,作者是龙马|连载网站|九玄珠|网络小说|0|纵横中文网|网站|5 +个人简介梁信强,男,2010年广州亚运会中国澳门代表团成员|国籍|梁信强|人物|4|中国|国家|20 + +- 安装依赖: `pip install -r requirements.txt` + +- 
存放数据:在 `data/origin` 文件夹下存放训练数据。训练文件主要有三个文件。 + + - `train.csv`:存放训练数据集 + + - `valid.csv`:存放验证数据集 + + - `relation.txt`:存放关系种类 + +- 开始训练:python main.py + +- 每次训练的结果会保存在 `checkpoints` 文件夹下,格式为:`{model_name}_{epoch}_{time}.pth`。 + +## 具体介绍 + +见 [wiki](https://github.com/zjunlp/deepke/wiki) diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/deepke/__init__.py b/deepke/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/deepke/config.py b/deepke/config.py new file mode 100644 index 0000000..59c19ce --- /dev/null +++ b/deepke/config.py @@ -0,0 +1,96 @@ +# 原始文件位置 + +class Config(object): + data_path = 'data/origin' + # 预处理后存放文件的位置 + out_path = 'data/out' + + # 是否为中文数据 + is_chinese = True + # 是否需要分词操作 + word_segment = True + + # 关系种类 + relation_type = 10 + + # vocab 构建时最低词频控制 + min_freq = 2 + + # position embedding + pos_limit = 50 # [-50, 50] + pos_size = 102 # 2 * pos_limit + 2 + + # model name + # (CNN, BiLSTM, Transformer, Capsule, Bert) + model_name = 'CNN' + + # model + word_dim = 50 + pos_dim = 5 + + # feature_dim = 50 + 5 * 2 + hidden_dim = 100 + dropout = 0.3 + + # PCNN config + use_pcnn = True + out_channels = 100 + kernel_size = [3, 5] + + # BiLSTM + lstm_layers = 2 + last_hn = False + + # Transformer + transformer_layers = 2 + + # Capsule + num_primary_units=8 + num_output_units=10 # relation_type + primary_channels=1 + primary_unit_size=768 + output_unit_size=128 + num_iterations=5 + + # Bert + lm_name = 'bert-base-chinese' + + # train + seed = 1 + use_gpu = True + gpu_id = 3 + epoch = 30 + learning_rate = 1e-3 + decay_rate = 0.5 + decay_patience = 3 + batch_size = 64 + train_log = True + log_interval = 10 + show_plot = False + f1_norm = ['macro', 'micro'] + + + + +def parse(self, kwargs): + ''' + user can update the default hyperparamter + ''' + for k, v in kwargs.items(): + if not hasattr(self, k): + raise Exception('opt has No key: {}'.format(k)) + setattr(self, k, v) + + + 
print('*************************************************') + print('user config:') + for k, v in kwargs.items(): + if not k.startswith('__'): + print("{} => {}".format(k, getattr(self, k))) + + print('*************************************************') + + +Config.parse = parse + +config =Config() diff --git a/deepke/dataset.py b/deepke/dataset.py new file mode 100644 index 0000000..e3db07b --- /dev/null +++ b/deepke/dataset.py @@ -0,0 +1,90 @@ +import torch +from torch.utils.data import Dataset +from deepke.utils import load_pkl + + +class CustomLMDataset(Dataset): + def __init__(self, fp): + self.file = load_pkl(fp) + + def __getitem__(self, item): + sample = self.file[item] + return sample + + def __len__(self): + return len(self.file) + + +def collate_fn_lm(batch): + batch.sort(key=lambda data: len(data[0]), reverse=True) + lens = [len(data[0]) for data in batch] + max_len = max(lens) + + def _padding(x, max_len): + return x + [0] * (max_len - len(x)) + + sent_arr = [] + y_arr = [] + for data in batch: + sent, data_y = data + sent_arr.append(_padding(sent, max_len)) + y_arr.append(data_y) + return torch.tensor(sent_arr), torch.tensor(y_arr) + + +class CustomDataset(Dataset): + def __init__(self, fp): + self.file = load_pkl(fp) + + def __getitem__(self, item): + sample = self.file[item] + return sample + + def __len__(self): + return len(self.file) + + +def collate_fn(batch): + batch.sort(key=lambda data: len(data[0]), reverse=True) + lens = [len(data[0]) for data in batch] + max_len = max(lens) + + def _padding(x, max_len): + return x + [0] * (max_len - len(x)) + + sent_arr = [] + head_pos_arr = [] + tail_pos_arr = [] + mask_arr = [] + y_arr = [] + for data in batch: + sent, head_pos, tail_pos, mask, data_y = data + sent_arr.append(_padding(sent, max_len)) + head_pos_arr.append(_padding(head_pos, max_len)) + tail_pos_arr.append(_padding(tail_pos, max_len)) + mask_arr.append(_padding(mask, max_len)) + y_arr.append(data_y) + return torch.tensor(sent_arr), 
torch.tensor(head_pos_arr), torch.tensor( + tail_pos_arr), torch.tensor(mask_arr), torch.tensor(y_arr) + + +if __name__ == '__main__': + from torch.utils.data import DataLoader + vocab_path = 'data/out/vocab.pkl' + train_data_path = 'data/out/train.pkl' + vocab = load_pkl(vocab_path) + + train_dataset = CustomDataset(train_data_path) + dataloader = DataLoader(train_dataset, + batch_size=4, + shuffle=False, + collate_fn=collate_fn) + for idx, (*x, y) in enumerate(dataloader): + sent, head_pos, tail_pos, mask = x + + raw_sents = [] + for i in range(4): + raw_sent = [vocab.idx2word[i] for i in sent[i].numpy()] + raw_sents.append(''.join(raw_sent)) + print(raw_sents, head_pos, tail_pos, mask, y, sep='\n\n') + break diff --git a/deepke/model/BasicModule.py b/deepke/model/BasicModule.py new file mode 100644 index 0000000..9feaa20 --- /dev/null +++ b/deepke/model/BasicModule.py @@ -0,0 +1,33 @@ +import torch +import torch.nn as nn +import time +from deepke.utils import ensure_dir + +class BasicModule(nn.Module): + ''' + 封装nn.Module, 提供 save 和 load 方法 + ''' + def __init__(self): + super(BasicModule, self).__init__() + self.model_name = str(type(self)) + + def load(self, path): + ''' + 加载指定路径的模型 + ''' + self.load_state_dict(torch.load(path)) + + def save(self, epoch=0, name=None): + ''' + 保存模型,默认使用“模型名字+时间”作为文件名 + ''' + prefix = 'checkpoints/' + ensure_dir(prefix) + if name is None: + name = prefix + self.model_name + '_' + f'epoch{epoch}_' + name = time.strftime(name + '%m%d_%H:%M:%S.pth') + else: + name = prefix + name + '_'+ self.model_name + '_' + f'epoch{epoch}_' + name = time.strftime(name + '%m%d_%H:%M:%S.pth') + torch.save(self.state_dict(), name) + return name diff --git a/deepke/model/Bert.py b/deepke/model/Bert.py new file mode 100644 index 0000000..b9d1db1 --- /dev/null +++ b/deepke/model/Bert.py @@ -0,0 +1,19 @@ +import torch.nn as nn +from deepke.model import BasicModule +from pytorch_transformers import BertModel + + +class Bert(BasicModule): + def 
__init__(self, vocab_size, config): + super(Bert, self).__init__() + self.model_name = 'Bert' + self.lm_name = config.lm_name + self.out_dim = config.relation_type + + self.lm = BertModel.from_pretrained(self.lm_name) + self.fc = nn.Linear(768, self.out_dim) + + def forward(self, x): + out = self.lm(x)[-1] + out = self.fc(out) + return out diff --git a/deepke/model/BiLSTM.py b/deepke/model/BiLSTM.py new file mode 100644 index 0000000..5d2d8e9 --- /dev/null +++ b/deepke/model/BiLSTM.py @@ -0,0 +1,112 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from deepke.model import BasicModule, Embedding + + +class VarLenLSTM(BasicModule): + def __init__(self, + input_size, + hidden_size, + lstm_layers=1, + dropout=0, + last_hn=False): + super(VarLenLSTM, self).__init__() + self.model_name = 'VarLenLSTM' + self.lstm_layers = lstm_layers + self.last_hn = last_hn + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=lstm_layers, + dropout=dropout, + bidirectional=True, + bias=True, + batch_first=True, + ) + + def forward(self, x, x_len): + ''' + 针对有 padding 的句子 + 一般来说,out 用来做序列标注,hn 做分类任务 + :param x: [B * L * H] + :param x_len: [l...] 
+ :return: + out: [B * seq_len * hidden] hidden = 2 * hidden_dim + hn: [B * layers * hidden] hidden = 2 * hidden_dim + ''' + x = pack_padded_sequence(x, + x_len, + batch_first=True, + enforce_sorted=True) + out, (hn, _) = self.lstm(x) + out, _ = pad_packed_sequence(out, batch_first=True, padding_value=0.0) + hn = hn.transpose(0, 1).contiguous() + # [B, layers, 2*hidden] + hn = hn.view(hn.size(0), self.lstm_layers, -1) + if self.last_hn: + hn = hn[:, -1].unsqueeze(1) + + return out, hn + + +class BiLSTM(BasicModule): + def __init__(self, vocab_size, config): + super(BiLSTM, self).__init__() + self.model_name = 'BiLSTM' + self.word_dim = config.word_dim + self.pos_size = config.pos_size + self.pos_dim = config.pos_dim + self.hidden_dim = config.hidden_dim + self.lstm_layers = config.lstm_layers + self.last_hn = config.last_hn + self.out_dim = config.relation_type + self.dropout = config.dropout + + self.embedding = Embedding(vocab_size, self.word_dim, self.pos_size, + self.pos_dim) + self.input_dim = self.word_dim + self.pos_dim * 2 + self.lstm = VarLenLSTM(self.input_dim, + self.hidden_dim, + self.lstm_layers, + dropout=self.dropout, + last_hn=self.last_hn) + if self.last_hn: + linear_input_dim = self.hidden_dim * 2 + else: + linear_input_dim = self.hidden_dim * 2 * self.lstm_layers + self.fc1 = nn.Linear(linear_input_dim, self.hidden_dim) + self.fc2 = nn.Linear(self.hidden_dim, self.out_dim) + + def forward(self, input): + *x, mask = input + x = self.embedding(x) + x_lens = torch.sum(mask.gt(0), dim=-1) + _, hn = self.lstm(x, x_lens) + hn = hn.view(hn.size(0), -1) + y = F.leaky_relu(self.fc1(hn)) + y = F.leaky_relu(self.fc2(y)) + return y + + +if __name__ == '__main__': + torch.manual_seed(1) + x = torch.tensor([ + [1, 2, 3, 4, 3, 2], + [1, 2, 3, 0, 0, 0], + [2, 4, 3, 0, 0, 0], + [2, 3, 0, 0, 0, 0], + ]) + x_len = torch.tensor([6, 3, 3, 2]) + embedding = nn.Embedding(5, 10, padding_idx=0) + model = VarLenLSTM(input_size=10, + hidden_size=30, + lstm_layers=5, + 
last_hn=False) + + x = embedding(x) # [4, 6, 5] + out, hn = model(x, x_len) + # out: [4, 6, 60] [B, seq_len, 2 * hidden] + # hn: [4, 5, 60] [B, layers, 2 * hidden] + print(out.shape, hn.shape) diff --git a/deepke/model/CNN.py b/deepke/model/CNN.py new file mode 100644 index 0000000..a260326 --- /dev/null +++ b/deepke/model/CNN.py @@ -0,0 +1,89 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from deepke.model import BasicModule, Embedding + + +class CNN(BasicModule): + def __init__(self, vocab_size, config): + super(CNN, self).__init__() + self.model_name = 'CNN' + self.out_channels = config.out_channels + self.kernel_size = config.kernel_size + self.word_dim = config.word_dim + self.pos_size = config.pos_size + self.pos_dim = config.pos_dim + self.use_pcnn = config.use_pcnn + self.hidden_dim = config.hidden_dim + self.out_dim = config.relation_type + self.dropout = config.dropout + + if isinstance(self.kernel_size, int): + self.kernel_size = [self.kernel_size] + for k in self.kernel_size: + assert k % 2 == 1, "kernel size has to be odd numbers." 
+ + self.embedding = Embedding(vocab_size, self.word_dim, self.pos_size, + self.pos_dim) + # PCNN embedding + self.mask_embed = nn.Embedding(4, 3) + masks = torch.tensor([[0, 0, 0], [100, 0, 0], [0, 100, 0], [0, 0, + 100]]) + self.mask_embed.weight.data.copy_(masks) + self.mask_embed.weight.requires_grad = False + + self.input_dim = self.word_dim + self.pos_dim * 2 + self.convs = nn.ModuleList([ + nn.Conv1d(in_channels=self.input_dim, + out_channels=self.out_channels, + kernel_size=k, + padding=k // 2, + bias=None) for k in self.kernel_size + ]) + self.conv_dim = len(self.kernel_size) * self.out_channels + if self.use_pcnn: + self.conv_dim *= 3 + self.fc1 = nn.Linear(self.conv_dim, self.hidden_dim) + self.fc2 = nn.Linear(self.hidden_dim, self.out_dim) + self.dropout = nn.Dropout(self.dropout) + + def forward(self, input): + *x, mask = input + x = self.embedding(x) + mask_embed = self.mask_embed(mask) + + # [B,L,C] -> [B,C,L] + x = torch.transpose(x, 1, 2) + + # CNN + x = [F.leaky_relu(conv(x)) for conv in self.convs] + x = torch.cat(x, dim=1) + + # mask + mask = mask.unsqueeze(1) # B x 1 x L + x = x.masked_fill_(mask.eq(0), float('-inf')) + + if self.use_pcnn: + # triple max_pooling + x = x.unsqueeze(-1).permute(0, 2, 1, 3) # [B, L, C, 1] + mask_embed = mask_embed.unsqueeze(-2) # [B, L, 1, 3] + x = x + mask_embed # [B, L, C, 3] + x = torch.max(x, dim=1)[0] - 100 # [B, C, 3] + x = x.view(x.size(0), -1) # [B, 3*C] + + else: + # max_pooling + x = F.max_pool1d(x, x.size(-1)).squeeze(-1) # [[B,C],..] 
+ + # droup + x = self.dropout(x) + + # linear + x = F.leaky_relu(self.fc1(x)) + x = F.leaky_relu(self.fc2(x)) + + return x + + +if __name__ == '__main__': + pass diff --git a/deepke/model/Capsule.py b/deepke/model/Capsule.py new file mode 100644 index 0000000..89da583 --- /dev/null +++ b/deepke/model/Capsule.py @@ -0,0 +1,212 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from deepke.model import BasicModule, Embedding, VarLenLSTM + + +class Capsule(BasicModule): + def __init__(self, vocab_size, config): + super(Capsule, self).__init__() + self.model_name = 'Capsule' + self.word_dim = config.word_dim + self.pos_size = config.pos_size + self.pos_dim = config.pos_dim + self.hidden_dim = config.hidden_dim + self.dropout = config.dropout + + self.num_primary_units = config.num_primary_units + self.num_output_units = config.num_output_units + self.primary_channels = config.primary_channels + self.primary_unit_size = config.primary_unit_size + self.output_unit_size = config.output_unit_size + self.num_iterations = config.num_iterations + + self.embedding = Embedding(vocab_size, self.word_dim, self.pos_size, + self.pos_dim) + self.input_dim = self.word_dim + self.pos_dim * 2 + self.lstm = VarLenLSTM( + self.input_dim, + self.hidden_dim, + ) + self.capsule = CapsuleNet(self.num_primary_units, + self.num_output_units, self.primary_channels, + self.primary_unit_size, + self.output_unit_size, self.num_iterations) + + def forward(self, input): + *x, mask = input + x = self.embedding(x) + x_lens = torch.sum(mask.gt(0), dim=-1) + _, hn = self.lstm(x, x_lens) + out = self.capsule(hn) + return out # B, num_output_units, output_unit_size + + def predict(self, output): + v_mag = torch.sqrt((output**2).sum(dim=2, keepdim=False)) + pred = v_mag.argmax(1, keepdim=False) + return pred + + def loss(self, input, target, size_average=True): + batch_size = input.size(0) + + v_mag = torch.sqrt((input**2).sum(dim=2, keepdim=True)) + + max_l = torch.relu(0.9 - 
class CapsuleLayer(nn.Module):
    """Capsule layer that is either a bank of conv units or a routed layer.

    With ``use_routing=False`` the layer owns ``out_units`` ``ConvUnit``
    modules (registered as ``unit_<i>``), applies each of them to the input
    and squashes the flattened outputs.

    With ``use_routing=True`` the layer owns a weight tensor ``W`` of shape
    ``(1, in_channels, out_units, unit_size, in_units)`` and runs
    ``num_iterations`` rounds of routing over the incoming capsules.

    :param in_units: number of incoming capsules (routing mode only).
    :param out_units: number of capsules produced by this layer.
    :param in_channels: conv input channels (no routing) or size of each
        incoming capsule vector (routing).
    :param unit_size: size of each output capsule vector (routing mode).
    :param use_routing: select routing mode instead of the conv bank.
    :param num_iterations: routing rounds; must be >= 1 in routing mode.
    :raises ValueError: if routing is requested with fewer than 1 iteration.
    """

    def __init__(self, in_units, out_units, in_channels, unit_size,
                 use_routing, num_iterations):
        super(CapsuleLayer, self).__init__()
        self.in_units = in_units
        self.out_units = out_units
        self.in_channels = in_channels
        self.unit_size = unit_size
        self.use_routing = use_routing

        if self.use_routing:
            # Without at least one round there is no output capsule to
            # return; fail at construction time instead of inside forward().
            if num_iterations < 1:
                raise ValueError(
                    'num_iterations must be >= 1 when use_routing is True')
            self.W = nn.Parameter(
                torch.randn(1, in_channels, out_units, unit_size, in_units))
            self.num_iterations = num_iterations
        else:

            def create_conv_unit(unit_idx):
                unit = ConvUnit(in_channels=in_channels)
                # add_module registers the unit's parameters on this layer;
                # the plain list below is only used for ordered access.
                self.add_module("unit_" + str(unit_idx), unit)
                return unit

            self.units = [create_conv_unit(i) for i in range(self.out_units)]

    @staticmethod
    def squash(s, eps=1e-12):
        """Squash vectors along dim 2 (equation 1 of the capsule paper).

        The squared norm is clamped to ``eps`` before the square root so an
        all-zero vector maps to zero (with a finite gradient) instead of NaN.

        NOTE(review): the norm is always taken over dim 2. For the 5-D tensor
        passed in by ``routing`` that axis has size ``out_units`` rather than
        ``unit_size`` - confirm this is intended.
        """
        mag_sq = torch.sum(s**2, dim=2, keepdim=True)
        mag = torch.sqrt(mag_sq.clamp(min=eps))
        return (mag_sq / (1.0 + mag_sq)) * (s / mag)

    def forward(self, x):
        """Dispatch to the routing or the conv-bank implementation."""
        if self.use_routing:
            return self.routing(x)
        return self.no_routing(x)

    def no_routing(self, x):
        """Apply every conv unit to ``x``; return (batch, out_units, -1)."""
        # Each unit output is (batch, channels, feature).
        u = [unit(x) for unit in self.units]

        # Stack all unit outputs: (batch, unit, channels, feature).
        u = torch.stack(u, dim=1)

        # Flatten to (batch, unit, output).
        u = u.view(x.size(0), self.out_units, -1)

        return CapsuleLayer.squash(u)

    def routing(self, x):
        """Route ``x`` (batch, in_units, features) to the output capsules.

        :return: tensor of shape (batch, out_units, unit_size). Only the two
            singleton helper axes are removed, so a batch of size 1 keeps its
            batch dimension.
        """
        # (batch, in_units, features) -> (batch, features, in_units)
        x = x.transpose(1, 2)

        # (batch, features, 1, in_units, 1); broadcasting against W replaces
        # the explicit per-output-unit and per-batch copies.
        x = x.unsqueeze(2).unsqueeze(4)

        # (batch, features, out_units, unit_size, 1)
        u_hat = torch.matmul(self.W, x)

        # Routing logits start at zero and are shared by the whole batch.
        b_ij = torch.zeros(1, self.in_channels, self.out_units, 1,
                           dtype=u_hat.dtype, device=u_hat.device)

        for _ in range(self.num_iterations):
            # (1, features, out_units, 1, 1), softmax over the features axis.
            c_ij = F.softmax(b_ij, dim=1).unsqueeze(4)

            # Weighted sum over incoming capsules:
            # (batch, 1, out_units, unit_size, 1)
            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            v_j = CapsuleLayer.squash(s_j)

            # Agreement u_hat^T v_j, v_j broadcast over the features axis:
            # (batch, features, out_units, 1, 1)
            agreement = torch.matmul(u_hat.transpose(3, 4), v_j)

            # NOTE(review): the logits are replaced by the batch-mean
            # agreement rather than accumulated (b_ij + agreement) - confirm
            # this is intended; changing it alters trained behaviour.
            b_ij = agreement.squeeze(4).mean(dim=0, keepdim=True)

        # (batch, 1, out_units, unit_size, 1) -> (batch, out_units, unit_size)
        return v_j.squeeze(4).squeeze(1)
class MultiHeadAttention(nn.Module):
    """Multi-head dot-product attention.

    Every head works on a full ``feature_size``-wide projection: the input
    projections map ``feature_size -> feature_size * num_head`` and the output
    projection maps back to ``feature_size``.

    :param feature_size: int, size of the input features; also the size of
        the output features.
    :param num_head: int, number of attention heads.
    :param dropout: float, dropout rate passed to the attention module.
    """
    def __init__(self, feature_size, num_head, dropout=0.2):
        super(MultiHeadAttention, self).__init__()
        self.feature_size = feature_size
        self.num_head = num_head
        self.q_in = nn.Linear(feature_size, feature_size * num_head)
        self.k_in = nn.Linear(feature_size, feature_size * num_head)
        self.v_in = nn.Linear(feature_size, feature_size * num_head)
        self.attention = DotAttention(dropout=dropout)
        self.out = nn.Linear(feature_size * num_head, feature_size)

    def forward(self, Q, K, V, att_mask_out=None):
        """
        :param Q: [batch, seq_len_q, feature_size]
        :param K: [batch, seq_len_k, feature_size]
        :param V: [batch, seq_len_k, feature_size]
        :param att_mask_out: optional mask of positions to exclude, shaped
            [batch, seq_len_k], [batch, 1, seq_len_k] or
            [batch, seq_len_q, seq_len_k]
        :return: [batch, seq_len_q, feature_size]
        """
        batch, sq, feature = Q.size()
        sk = K.size(1)
        n_head = self.num_head

        # Input projections, split into heads.
        q = self.q_in(Q).view(batch, sq, n_head, feature)
        k = self.k_in(K).view(batch, sk, n_head, feature)
        v = self.v_in(V).view(batch, sk, n_head, feature)

        # Fold the head axis into the batch axis so all heads are attended in
        # one call:
        # [batch, seq_len, num_head, feature] => [num_head*batch, seq_len, feature]
        q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, feature)
        k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, feature)
        v = v.permute(2, 0, 1, 3).contiguous().view(-1, sk, feature)

        if att_mask_out is not None:
            # A plain padding mask [batch, seq_len_k] gets a broadcastable
            # query axis; without it repeat() would produce a
            # [num_head, batch, seq_len_k] tensor that does not line up with
            # the folded [num_head*batch, seq_len_q, seq_len_k] scores.
            if att_mask_out.dim() == 2:
                att_mask_out = att_mask_out.unsqueeze(1)
            # Tile along the batch axis to match the head-major folding above.
            att_mask_out = att_mask_out.repeat(n_head, 1, 1)

        att = self.attention(q, k, v,
                             att_mask_out).view(n_head, batch, sq, feature)

        # Concatenate all heads, then apply the output projection:
        # [num_head, batch, seq_len, feature] => [batch, seq_len, num_head*feature]
        att = att.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1)
        output = self.out(att)
        return output
def mask_feature(entities_pos: List, sen_len: int) -> List:
    """Label every position of a sentence by its segment around two entities.

    Positions up to and including the first entity get ``1``, positions
    strictly between the two entities get ``2`` and positions from the second
    entity onwards get ``3``.

    The result always has exactly ``sen_len`` items. Equal entity positions
    and positions given in descending order are handled by sorting and
    clamping, so the mask cannot grow longer than the sentence it describes.

    :param entities_pos: the two entity positions (first two items are used).
    :param sen_len: length of the sentence.
    :return: list of ``sen_len`` segment ids drawn from {1, 2, 3}.
    """
    first, second = sorted((entities_pos[0], entities_pos[1]))
    left_len = min(max(first + 1, 0), sen_len)
    middle_len = min(max(second - first - 1, 0), sen_len - left_len)
    # Whatever is left belongs to the last segment; this is what keeps the
    # total length equal to sen_len in the degenerate cases.
    right_len = sen_len - left_len - middle_len
    return [1] * left_len + [2] * middle_len + [3] * right_len
def relation_tokenize(relations: List[str], fp: str) -> List[int]:
    """Map relation names to their line index in the relation file.

    Each stripped line of ``fp`` is a relation name and its zero-based line
    number is its id. If a name occurs on several lines, the last one wins.

    :param relations: relation names to convert.
    :param fp: path of the relation file, one relation per line.
    :return: list of relation ids, in the order of ``relations``.
    :raises KeyError: if a relation is not listed in ``fp``; the message names
        the missing relation and the file.
    """
    with open(fp, encoding='utf-8') as f:
        rel2idx = {line.strip(): idx for idx, line in enumerate(f)}

    out = []
    for rel in relations:
        if rel not in rel2idx:
            raise KeyError(f'relation {rel!r} not found in {fp}')
        out.append(rel2idx[rel])
    return out
def exist_relation(fp: str, file_type: str) -> int:
    '''
    Tell whether a data file carries relation labels, i.e. whether it is meant
    for training or for prediction.

    For ``'csv'`` the header row is inspected; for any other ``file_type`` the
    first non-blank line is parsed as a JSON object. A file without a header
    or record yields ``-1`` instead of failing.

    :param fp: path of the data file
    :param file_type: ``'csv'`` or ``'jsonld'``
    :return: zero-based position of the ``relation`` field, or -1 if absent
    '''
    keys = []
    if file_type == 'csv':
        # newline='' is the csv module's recommended way to open its inputs.
        with open(fp, encoding='utf-8', newline='') as f:
            keys = list(csv.DictReader(f).fieldnames or [])
    else:
        with open(fp, encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    keys = list(json.loads(line).keys())
                    break
    try:
        return keys.index('relation')
    except ValueError:
        return -1
+ file_type) + relation_fp = os.path.join(data_path, 'relation.txt') + + relation_place = exist_relation(train_fp, file_type) + if file_type == 'csv': + train_raw_data = load_csv(train_fp) + test_raw_data = load_csv(test_fp) + else: + train_raw_data = load_jsonld(train_fp) + test_raw_data = load_jsonld(test_fp) + train_relation = [] + test_relation = [] + if relation_place > -1: + for data in train_raw_data: + train_relation.append(data.pop(relation_place)) + for data in test_raw_data: + test_relation.append(data.pop(relation_place)) + + # 使用语言模型预训练时 + if config.model_name == 'Bert': + train_lm_sents = build_lm_data(train_raw_data) + test_lm_sents = build_lm_data(test_raw_data) + + # 当为中文时是否需要分词操作,如果sentence已经为分词的结果,则不需要分词 + print('\nverify whether need split words...') + if config.is_chinese and config.word_segment: + train_raw_data = split_sents(train_raw_data) + test_raw_data = split_sents(test_raw_data, verbose=False) + + print('build sentence vocab...') + vocab, vocab_path = build_vocab(train_raw_data, out_path) + + print('\nbuild train data...') + train_sents, train_head_pos, train_tail_pos, train_mask_pos = build_data( + train_raw_data, vocab) + print('build test data...') + test_sents, test_head_pos, test_tail_pos, test_mask_pos = build_data( + test_raw_data, vocab) + print('build relation data...\n') + train_rel_tokens = relation_tokenize(train_relation, relation_fp) + test_rel_tokens = relation_tokenize(test_relation, relation_fp) + + train_data = list( + zip(train_sents, train_head_pos, train_tail_pos, train_mask_pos, + train_rel_tokens)) + test_data = list( + zip(test_sents, test_head_pos, test_tail_pos, test_mask_pos, + test_rel_tokens)) + + if config.model_name == 'Bert': + train_data = list(zip(train_lm_sents, train_rel_tokens)) + test_data = list(zip(test_lm_sents, test_rel_tokens)) + + ensure_dir(out_path) + train_data_path = os.path.join(out_path, 'train.pkl') + test_data_path = os.path.join(out_path, 'test.pkl') + + save_pkl(train_data_path, 
train_data, 'train data') + save_pkl(test_data_path, test_data, 'test data') + + if config.model_name == 'Bert': + train_lm_data_path = os.path.join(out_path, 'train_lm.pkl') + test_lm_data_path = os.path.join(out_path, 'test_lm.pkl') + + save_pkl(train_lm_data_path, train_data, 'train data') + save_pkl(test_lm_data_path, test_data, 'test data') + + print('===== end preprocess data =====') + + +if __name__ == "__main__": + data_path = '../data/origin' + out_path = '../data/out' + + process(data_path, out_path, file_type='csv') diff --git a/deepke/trainer.py b/deepke/trainer.py new file mode 100644 index 0000000..be71e5c --- /dev/null +++ b/deepke/trainer.py @@ -0,0 +1,72 @@ +import torch +import numpy as np +import matplotlib.pyplot as plt +from sklearn.metrics import precision_recall_fscore_support +from deepke.utils import to_one_hot + + +def train(epoch, device, dataloader, model, optimizer, criterion, config): + model.train() + total_loss = [] + + for batch_idx, batch in enumerate(dataloader, 1): + *x, y = [data.to(device) for data in batch] + optimizer.zero_grad() + y_pred = model(x) + + if model.model_name == 'Capsule': + y = to_one_hot(y,config.relation_type) + loss = model.loss(y_pred, y) + else: + loss = criterion(y_pred, y) + + loss.backward() + optimizer.step() + total_loss.append(loss.item()) + + # logging + data_cal = len(dataloader.dataset) if batch_idx == len( + dataloader) else batch_idx * len(y) + if (config.train_log and batch_idx % + config.log_interval == 0) or batch_idx == len(dataloader): + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, data_cal, len(dataloader.dataset), + 100. 
def to_one_hot(x, length):
    """Convert a batch of class indices into one-hot rows.

    :param x: integer tensor whose first dimension is the batch; row ``i``
        holds the index (or indices) to switch on in output row ``i``.
    :param length: number of classes, i.e. the width of each one-hot row.
    :return: tensor of shape ``(batch, length)`` on ``x``'s device, in the
        default floating dtype, with ``1.0`` at the given indices.
    """
    batch_size = x.size(0)
    x_one_hot = torch.zeros(batch_size, length, device=x.device)
    if batch_size == 0:
        return x_one_hot
    # One indexed assignment instead of a Python loop over rows; the row
    # index is a column vector so it broadcasts against every index of a row.
    rows = torch.arange(batch_size, device=x.device).unsqueeze(1)
    x_one_hot[rows, x.reshape(batch_size, -1).long()] = 1.0
    return x_one_hot
def seq_len_to_mask(seq_len, max_len=None):
    """
    Turn a 1-d array of sequence lengths into a 2-d mask in which positions
    beyond each sequence's length are False/0.

    .. code-block::

        >>> seq_len = torch.arange(2, 16)
        >>> mask = seq_len_to_mask(seq_len)
        >>> print(mask.size())
        torch.Size([14, 15])
        >>> seq_len = np.arange(2, 16)
        >>> mask = seq_len_to_mask(seq_len)
        >>> print(mask.shape)
        (14, 15)
        >>> seq_len = torch.arange(2, 16)
        >>> mask = seq_len_to_mask(seq_len, max_len=100)
        >>> print(mask.size())
        torch.Size([14, 100])

    :param np.ndarray,torch.LongTensor seq_len: lengths, shape (B,)
    :param int max_len: width of the mask. ``None`` (default) uses the longest
        length in ``seq_len``. Pass it explicitly when the batch is split
        (e.g. under nn.DataParallel) and every part must be padded to the
        same width.
    :return: np.ndarray or torch.Tensor of shape (B, max_len) with boolean
        (or uint8, on torch versions without a bool dtype) entries
    :raises TypeError: if ``seq_len`` is neither an ndarray nor a Tensor
    """
    if isinstance(seq_len, np.ndarray):
        assert seq_len.ndim == 1, \
            f"seq_len can only have one dimension, got {seq_len.ndim}."
        if max_len is None:
            # An empty batch has no longest sequence; use width 0.
            max_len = int(seq_len.max()) if seq_len.size else 0
        max_len = int(max_len)
        mask = np.arange(max_len)[None, :] < seq_len[:, None]

    elif isinstance(seq_len, torch.Tensor):
        assert seq_len.dim() == 1, \
            f"seq_len can only have one dimension, got {seq_len.dim()}."
        if max_len is None:
            max_len = int(seq_len.max()) if seq_len.numel() else 0
        max_len = int(max_len)
        # Build the position index directly with seq_len's dtype and device.
        positions = torch.arange(max_len,
                                 dtype=seq_len.dtype,
                                 device=seq_len.device)
        mask = positions.unsqueeze(0).lt(seq_len.unsqueeze(1))
    else:
        raise TypeError("Only support 1-d numpy.ndarray or 1-d torch.Tensor.")

    return mask
# Tokens every vocabulary starts with; they take the first indices in order.
init_tokens = ['PAD', 'UNK']


class Vocab(object):
    """Word <-> index vocabulary with occurrence counts.

    :param name: label of the vocabulary (for example ``'word'`` or
        ``'char'``).
    :param init_tokens: tokens inserted first, in order, so they receive the
        lowest indices.
    """

    def __init__(self, name: str, init_tokens: List[str] = init_tokens):
        self.name = name
        # Copy so the shared module-level default is never aliased.
        self.init_tokens = list(init_tokens)
        self.trimed = False
        self.word2idx = {}
        self.word2count = {}
        self.idx2word = {}
        self.count = 0
        self.add_init_tokens()

    def add_init_tokens(self):
        """Insert the initial tokens (each is counted once)."""
        for token in self.init_tokens:
            self.add_word(token)

    def add_word(self, word):
        """Register one occurrence of ``word``, adding it if it is new."""
        if word not in self.word2idx:
            self.word2idx[word] = self.count
            self.word2count[word] = 1
            self.idx2word[self.count] = word
            self.count += 1
        else:
            self.word2count[word] += 1

    def add_sent(self, sent: str):
        """Register every item of ``sent``.

        ``sent`` is simply iterated: a string contributes its characters, a
        list of words contributes its words.
        """
        for word in sent:
            self.add_word(word)

    def trim(self, min_freq=2, verbose: bool = True):
        '''
        Drop every word whose count is below ``min_freq`` and re-index.

        Initial tokens are always kept and stay first; surviving words keep
        their relative order and their counts. Only the first call has an
        effect.

        :param min_freq: minimum count a word needs to be kept
        :param verbose: whether to print how many words were kept
        '''
        if self.trimed:
            return
        self.trimed = True

        kept = [(word, freq) for word, freq in self.word2count.items()
                if freq >= min_freq]
        old_size = len(self.word2idx)

        # Reinitialize dictionaries
        self.word2idx = {}
        self.word2count = {}
        self.idx2word = {}
        self.count = 0
        self.add_init_tokens()
        for word, freq in kept:
            if word not in self.word2idx:
                self.add_word(word)
            # Restore the count directly rather than replaying each
            # occurrence; this also keeps a surviving initial token's count
            # from being bumped by its re-insertion above.
            self.word2count[word] = freq

        if verbose:
            print('after trim, keep words [{} / {}] = {:.2f}%'.format(
                len(self.word2idx), old_size,
                len(self.word2idx) / old_size * 100))
# Training entry point: preprocess the data if needed, build the model chosen
# with --model_name, train it and report the best macro/micro F1 epochs.
import os
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from deepke.config import config
from deepke import model
from deepke.utils import make_seed, load_pkl
from deepke.trainer import train, validate
from deepke.process import process
from deepke.dataset import CustomDataset, CustomLMDataset, collate_fn, collate_fn_lm

__Models__ = {
    "CNN": model.CNN,
    "BiLSTM": model.BiLSTM,
    "Transformer": model.Transformer,
    "Capsule": model.Capsule,
    "Bert": model.Bert,
}

parser = argparse.ArgumentParser(description='choose your model')
parser.add_argument('--model_name',
                    type=str,
                    default='CNN',
                    choices=sorted(__Models__),
                    help='model name')
args = parser.parse_args()
model_name = args.model_name if args.model_name else config.model_name
# The model is built from `model_name`, so the data files, dataset class and
# collate function below are chosen from the same value. The shared config is
# updated as well because process() reads config.model_name.
# NOTE(review): assumes `config` accepts attribute assignment - confirm.
config.model_name = model_name
is_lm = model_name == 'Bert'

make_seed(config.seed)

if config.use_gpu and torch.cuda.is_available():
    device = torch.device('cuda', config.gpu_id)
else:
    device = torch.device('cpu')

if not os.path.exists(config.out_path):
    process(config.data_path, config.out_path, file_type='csv')

if is_lm:
    # NOTE(review): this .txt path is handed to load_pkl() below, and the
    # preprocessing code in this patch writes 'vocab.pkl' / 'vocab.txt' -
    # verify where 'bert_vocab.txt' comes from and how it should be loaded.
    vocab_path = os.path.join(config.out_path, 'bert_vocab.txt')
    train_data_path = os.path.join(config.out_path, 'train_lm.pkl')
    test_data_path = os.path.join(config.out_path, 'test_lm.pkl')
else:
    vocab_path = os.path.join(config.out_path, 'vocab.pkl')
    train_data_path = os.path.join(config.out_path, 'train.pkl')
    test_data_path = os.path.join(config.out_path, 'test.pkl')

vocab = load_pkl(vocab_path)
vocab_size = len(vocab.word2idx)

if is_lm:
    train_dataset = CustomLMDataset(train_data_path)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn_lm)
    test_dataset = CustomLMDataset(test_data_path)
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=collate_fn_lm,
    )
else:
    train_dataset = CustomDataset(train_data_path)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)
    test_dataset = CustomDataset(test_data_path)
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )

# Named `net` so the imported `model` package is not shadowed.
net = __Models__[model_name](vocab_size, config)
net.to(device)
print(net)

optimizer = optim.Adam(net.parameters(), lr=config.learning_rate)
# 'max' mode: the scheduler is stepped with macro F1, which should increase.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 'max', factor=config.decay_rate, patience=config.decay_patience)
criterion = nn.CrossEntropyLoss()

best_macro_f1, best_macro_epoch = 0, 1
best_micro_f1, best_micro_epoch = 0, 1
best_macro_model, best_micro_model = '', ''
print('=' * 10, ' Start training ', '=' * 10)

for epoch in range(1, config.epoch + 1):
    train(epoch, device, train_dataloader, net, optimizer, criterion, config)
    # Unpacking two values relies on config.f1_norm listing the macro and
    # micro averages, in that order.
    macro_f1, micro_f1 = validate(test_dataloader, net, device, config)
    # Keep the checkpoint path in its own variable instead of overwriting
    # `model_name`.
    checkpoint_path = net.save(epoch=epoch)
    scheduler.step(macro_f1)

    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        best_macro_epoch = epoch
        best_macro_model = checkpoint_path
    if micro_f1 > best_micro_f1:
        best_micro_f1 = micro_f1
        best_micro_epoch = epoch
        best_micro_model = checkpoint_path

print('=' * 10, ' End training ', '=' * 10)
print(f'best macro f1: {best_macro_f1:.4f},',
      f'in epoch: {best_macro_epoch}, saved in: {best_macro_model}')
print(f'best micro f1: {best_micro_f1:.4f},',
      f'in epoch: {best_micro_epoch}, saved in: {best_micro_model}')
0000000..22c3517 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +python>=3.6 +torch>=1.0 +jieba>=0.39 +scikit_learn>=0.21 +pytorch_transformers>=1.0