    Medical named entity recognition: accuracy with different model parameter sizes


      Experiment environment
      NVIDIA RTX 3090

      PyTorch is used as the underlying framework.

      Multiple models can run on a single card at the same time. For this experiment we use four models of the same architecture with different parameter sizes.
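
      Before launching several training jobs on the same card, it helps to check how much GPU memory is already in use. A minimal PyTorch sketch for this check (nvidia-smi reports the same information):

      import torch

      # Report total and currently allocated memory on the first CUDA device,
      # to gauge how many models can share the card.
      if torch.cuda.is_available():
          device = torch.device("cuda:0")
          props = torch.cuda.get_device_properties(device)
          total_gb = props.total_memory / 1024 ** 3
          allocated_gb = torch.cuda.memory_allocated(device) / 1024 ** 3
          print(f"{props.name}: {total_gb:.1f} GB total, {allocated_gb:.1f} GB allocated")
      else:
          print("CUDA is not available.")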
      Parameters of the tiny BERT version

      {
        "emb_size": 128,
        "feedforward_size": 512,
        "hidden_size": 128,
        "hidden_act": "gelu",
        "heads_num": 2,
        "layers_num": 2,
        "max_seq_length": 512,
        "dropout": 0.1,
        "embedding": "word_pos_seg",
        "encoder": "transformer",
        "mask": "fully_visible",
        "target": "bert"
      }
      

      Parameters of the small version

      {
        "emb_size": 512,
        "feedforward_size": 2048,
        "hidden_size": 512,
        "hidden_act": "gelu",
        "heads_num": 8,
        "layers_num": 4,
        "max_seq_length": 512,
        "dropout": 0.1,
        "embedding": "word_pos_seg",
        "encoder": "transformer",
        "mask": "fully_visible",
        "target": "bert"
      }
      

      Parameters of the middle version

      {
        "emb_size": 512,
        "feedforward_size": 2048,
        "hidden_size": 512,
        "hidden_act": "gelu",
        "heads_num": 8,
        "layers_num": 8,
        "max_seq_length": 512,
        "dropout": 0.1,
        "embedding": "word_pos_seg",
        "encoder": "transformer",
        "mask": "fully_visible",
        "target": "bert"
      }
      

      Parameters of the large version

      {
        "emb_size": 1024,
        "feedforward_size": 4096,
        "hidden_size": 1024,
        "hidden_act": "gelu",
        "heads_num": 16,
        "layers_num": 24,
        "max_seq_length": 512,
        "dropout": 0.1,
        "embedding": "word_pos_seg",
        "encoder": "transformer",
        "mask": "fully_visible",
        "target": "bert"
      }
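
      For a rough sense of scale, the four configurations can be compared by an approximate parameter count derived from the config fields alone. The sketch below is only an approximation (it ignores layer norms, biases, and the segment embedding, and assumes the roughly 21128-token google_zh_vocab.txt vocabulary); it is not a measured figure.

      # Approximate parameter count of a BERT-style encoder from its config fields.
      def approx_params(emb_size, hidden_size, feedforward_size, layers_num,
                        vocab_size=21128, max_seq_length=512):
          embedding = vocab_size * emb_size + max_seq_length * emb_size  # token + position embeddings
          attention = 4 * hidden_size * hidden_size                      # Q, K, V and output projections
          feedforward = 2 * hidden_size * feedforward_size               # the two feed-forward projections
          return embedding + layers_num * (attention + feedforward)

      configs = {
          "tiny":   dict(emb_size=128,  hidden_size=128,  feedforward_size=512,  layers_num=2),
          "small":  dict(emb_size=512,  hidden_size=512,  feedforward_size=2048, layers_num=4),
          "middle": dict(emb_size=512,  hidden_size=512,  feedforward_size=2048, layers_num=8),
          "large":  dict(emb_size=1024, hidden_size=1024, feedforward_size=4096, layers_num=24),
      }
      for name, cfg in configs.items():
          print(f"{name}: ~{approx_params(**cfg) / 1e6:.1f}M parameters")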
      

      The entities fall into the following categories (label-to-id mapping)

      {'TargetedTreatB': 0, 'BodyPartsB': 1, 'AbnormalType': 2, 'IOPB': 3, 'PathogenB': 4, 'Symptom': 5, 'DEPB': 6, 'Disease': 7, 'ImageTB': 8, 'MedEquipB': 9, 'BodyFunction': 10, 'Surgery': 11, 'MedEquip': 12, 'SignsB': 13, 'DiseaseB': 14, 'SurgeryB': 15, 'SymptomB': 16, 'TargetedTreat': 17, 'EnMedOrder': 18, 'EnMedOrderB': 19, 'DEP': 20, 'LabTB': 21, 'O': 22, 'AbnormalTypeB': 23, 'Drug': 24, 'BodyParts': 25, 'BodyFunctionB': 26, 'IOP': 27, 'BodySubstanceB': 28, 'GeneralTest': 29, 'Pathogen': 30, 'ImageT': 31, 'BodySubstance': 32, 'GeneralTestB': 33, 'Signs': 34, 'LabT': 35, 'DrugB': 36}
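
      The training script reads this mapping from the JSON file passed via --label2id_path (data/medical_label2id.json in the launch commands below). A minimal sketch for writing that file and deriving the entity-begin ids in the same way main() does; the dictionary here is truncated, the full mapping above should be used:

      import json

      label2id = {"TargetedTreatB": 0, "BodyPartsB": 1, "Symptom": 5, "Disease": 7, "O": 22}  # truncated example

      # Write the mapping consumed by --label2id_path.
      with open("data/medical_label2id.json", "w", encoding="utf-8") as f:
          json.dump(label2id, f, ensure_ascii=False)

      # Labels whose names end in "B" mark the beginning of an entity;
      # their ids become args.begin_ids in main() below.
      begin_ids = [i for label, i in label2id.items() if label.endswith("B")]
      print(begin_ids)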
      

      Core code

      import argparse
      import json
      import os
      import sys

      import torch
      import torch.nn as nn
      import torch.nn.functional as F
      
      uer_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
      sys.path.append(uer_dir)
      
      from uer.layers import *
      from uer.encoders import *
      from uer.utils.constants import *  # assumed to provide PAD_TOKEN, as in UER-py's run_ner.py
      from uer.utils.config import load_hyperparam
      from uer.utils.optimizers import *
      from uer.utils.seed import set_seed
      from uer.utils.tokenizers import *
      from uer.opts import finetune_opts
      import pandas as pd
      from finetune.run_classifier import build_optimizer, load_or_initialize_parameters
      
      
      class NerTagger(nn.Module):
          def __init__(self, args):
              super(NerTagger, self).__init__()
              self.embedding = str2embedding[args.embedding](args, len(args.tokenizer.vocab))
              self.encoder = str2encoder[args.encoder](args)
              self.labels_num = args.labels_num
              self.output_layer = nn.Linear(args.hidden_size, self.labels_num)
              self.crf_target = args.crf_target
              if args.crf_target:
                  from torchcrf import CRF
                  self.crf = CRF(self.labels_num, batch_first=True)
                  self.seq_length = args.seq_length
      
          def forward(self, src, tgt, seg):
              """
              向前传播
              Args:
                  src: [batch_size x seq_length]
                  tgt: [batch_size x seq_length]
                  seg: [batch_size x seq_length]
              Returns:
                  loss: Sequence labeling loss.
                  logits: Output logits.
              """
              # Embedding.
              emb = self.embedding(src, seg)
              # Encoder.
              output = self.encoder(emb, seg)
      
              # Target.
              logits = self.output_layer(output)
              if self.crf_target:
                  tgt_mask = seg.type(torch.uint8)
                  pred = self.crf.decode(logits, mask=tgt_mask)
                  for j in range(len(pred)):
                      while len(pred[j]) < self.seq_length:
                          pred[j].append(self.labels_num - 1)
                  pred = torch.tensor(pred).contiguous().view(-1)
                  if tgt is not None:
                      loss = -self.crf(F.log_softmax(logits, 2), tgt, mask=tgt_mask, reduction='mean')
                      return loss, pred
                  else:
                      return None, pred
              else:
                  tgt_mask = seg.contiguous().view(-1).float()
                  logits = logits.contiguous().view(-1, self.labels_num)
                  pred = logits.argmax(dim=-1)
                  if tgt is not None:
                      tgt = tgt.contiguous().view(-1, 1)
                      one_hot = torch.zeros(tgt.size(0), self.labels_num). \
                          to(torch.device(tgt.device)). \
                          scatter_(1, tgt, 1.0)
                      numerator = -torch.sum(nn.LogSoftmax(dim=-1)(logits) * one_hot, 1)
                      numerator = torch.sum(tgt_mask * numerator)
                      denominator = torch.sum(tgt_mask) + 1e-6
                      loss = numerator / denominator
                      return loss, pred
                  else:
                      return None, pred
      
      
      def read_dataset(args, path):
          dataset, columns = [], {}
          with open(path, mode="r", encoding="utf-8") as f:
              for line_id, line in enumerate(f):
                  if line_id == 0:
                      for i, column_name in enumerate(line.strip().split("\t")):
                          columns[column_name] = i
                      continue
                  line = line.strip().split("\t")
                  labels = line[columns["label"]]
                  tgt = [args.l2i[l] for l in labels.split(" ")]
      
                  text_a = line[columns["text_a"]]
                  src = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(text_a))
                  seg = [1] * len(src)
      
                  if len(src) > args.seq_length:
                      src = src[: args.seq_length]
                      tgt = tgt[: args.seq_length]
                      seg = seg[: args.seq_length]
                  PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
                  while len(src) < args.seq_length:
                      src.append(PAD_ID)
                      tgt.append(args.labels_num - 1)
                      seg.append(0)
                  dataset.append([src, tgt, seg])
      
          return dataset
      
      
      def read_bank_dataset(args, path):
          """
          读取银行评论命名实体识别数据集
          """
          dataset, columns = [], {}
          train_data = pd.read_csv(path)
          for line, labels in zip(train_data.text, train_data.BIO_anno):
              src = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(" ".join(list(line))))
      
              tgt = [args.l2i[l] for l in labels.split(" ")]
              seg = [1] * len(src)
              if len(src) > args.seq_length:
                  src = src[: args.seq_length]
                  tgt = tgt[: args.seq_length]
                  seg = seg[: args.seq_length]
              PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
      
              while len(src) < args.seq_length:
                  src.append(PAD_ID)
                  seg.append(0)
              while len(tgt) < args.seq_length:
                  tgt.append(4)
              if len(src) == len(tgt):
                  dataset.append([src, tgt, seg])
          # print(dataset[0])
          # print()
          return dataset
      
      
      def read_medical_dataset(args, path):
          """
          读取银行评论命名实体识别数据集
          """
          dataset, columns = [], {}
          train_data = json.load(open(path, "r", encoding="utf-8"))
          for example in train_data:
              labels = example["ner_tags"]
              line = example["tokens"]
              src = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(" ".join(line)))
      
              tgt = [args.l2i[l] for l in labels]
              seg = [1] * len(src)
              if len(src) > args.seq_length:
                  src = src[: args.seq_length]
                  tgt = tgt[: args.seq_length]
                  seg = seg[: args.seq_length]
              PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
      
              while len(src) < args.seq_length:
                  src.append(PAD_ID)
                  seg.append(0)
              while len(tgt) < args.seq_length:
                  tgt.append(args.l2i["O"])
              if len(src) == len(tgt):
                  dataset.append([src, tgt, seg])
          # print(dataset[0])
          # print()
          return dataset
      
      
      def batch_loader(batch_size, src, tgt, seg):
          instances_num = src.size()[0]
          for i in range(instances_num // batch_size):
              src_batch = src[i * batch_size: (i + 1) * batch_size, :]
              tgt_batch = tgt[i * batch_size: (i + 1) * batch_size, :]
              seg_batch = seg[i * batch_size: (i + 1) * batch_size, :]
              yield src_batch, tgt_batch, seg_batch
          if instances_num > instances_num // batch_size * batch_size:
              src_batch = src[instances_num // batch_size * batch_size:, :]
              tgt_batch = tgt[instances_num // batch_size * batch_size:, :]
              seg_batch = seg[instances_num // batch_size * batch_size:, :]
              yield src_batch, tgt_batch, seg_batch
      
      
      def train(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch):
          model.zero_grad()
      
          src_batch = src_batch.to(args.device)
          tgt_batch = tgt_batch.to(args.device)
          seg_batch = seg_batch.to(args.device)
          # print(src_batch, tgt_batch, seg_batch)
          loss, _ = model(src_batch, tgt_batch, seg_batch)
          if torch.cuda.device_count() > 1:
              loss = torch.mean(loss)
      
          if args.fp16:
              from apex import amp  # amp was initialized in main(); imported here so the name is in scope
              with amp.scale_loss(loss, optimizer) as scaled_loss:
                  scaled_loss.backward()
          else:
              loss.backward()
      
          optimizer.step()
          scheduler.step()
      
          return loss
      
      
      def evaluate(args, dataset):
          src = torch.LongTensor([sample[0] for sample in dataset])
          tgt = torch.LongTensor([sample[1] for sample in dataset])
          seg = torch.LongTensor([sample[2] for sample in dataset])
      
          batch_size = args.batch_size
      
          correct, gold_entities_num, pred_entities_num = 0, 0, 0
      
          args.model.eval()
      
          for i, (src_batch, tgt_batch, seg_batch) in enumerate(batch_loader(batch_size, src, tgt, seg)):
              src_batch = src_batch.to(args.device)
              tgt_batch = tgt_batch.to(args.device)
              seg_batch = seg_batch.to(args.device)
              loss, pred = args.model(src_batch, tgt_batch, seg_batch)
      
              gold = tgt_batch.contiguous().view(-1, 1)
      
              for j in range(gold.size()[0]):
                  if gold[j].item() in args.begin_ids:
                      gold_entities_num += 1
      
              for j in range(pred.size()[0]):
                  if pred[j].item() in args.begin_ids and gold[j].item() != args.l2i["[PAD]"]:
                      pred_entities_num += 1
      
              pred_entities_pos, gold_entities_pos = set(), set()
      
              for j in range(gold.size()[0]):
                  if gold[j].item() in args.begin_ids:
                      start = j
                      for k in range(j + 1, gold.size()[0]):
                          if gold[k].item() == args.l2i["[PAD]"] or gold[k].item() == args.l2i["O"] or gold[
                              k].item() in args.begin_ids:
                              end = k - 1
                              break
                      else:
                          end = gold.size()[0] - 1
                      gold_entities_pos.add((start, end))
      
              for j in range(pred.size()[0]):
                  if pred[j].item() in args.begin_ids and gold[j].item() != args.l2i["[PAD]"]:
                      start = j
                      for k in range(j + 1, pred.size()[0]):
                          if pred[k].item() == args.l2i["[PAD]"] or pred[k].item() == args.l2i["O"] or pred[
                              k].item() in args.begin_ids:
                              end = k - 1
                              break
                      else:
                          end = pred.size()[0] - 1
                      pred_entities_pos.add((start, end))
      
              for entity in pred_entities_pos:
                  if entity not in gold_entities_pos:
                      continue
                  for j in range(entity[0], entity[1] + 1):
                      if gold[j].item() != pred[j].item():
                          break
                  else:
                      correct += 1
      
          print("Report precision, recall, and f1:")
          eps = 1e-9
          p = correct / (pred_entities_num + eps)
          r = correct / (gold_entities_num + eps)
          f1 = 2 * p * r / (p + r + eps)
          print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1))
      
          return f1
      
      
      def main():
          parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
      
          finetune_opts(parser)
      
          parser.add_argument("--vocab_path", default=None, type=str,
                              help=" 词汇表文件的路径。 Path of the vocabulary file.")
          parser.add_argument("--model_label", default=None, type=str,
                              help=" 模型类别。 Path of the vocabulary file.")
          parser.add_argument("--spm_model_path", default=None, type=str,
                              help=" 句子片段模型的路径。  Path of the sentence piece model.")
          parser.add_argument("--label2id_path", type=str, required=True,
                              help="Path of the label2id file.")
          parser.add_argument("--crf_target", action="store_true",
                              help="Use CRF loss as the target function or not, default False.")
      
          args = parser.parse_args()
      
          # Load hyperparameters from the config file.
          args = load_hyperparam(args)
      
          set_seed(args.seed)
      
          args.begin_ids = []
          with open(args.label2id_path, mode="r", encoding="utf-8") as f:
              l2i = json.load(f)
              print("Labels: ", l2i)
              l2i["[PAD]"] = len(l2i)
              for label in l2i:
                  if label[-1] == "B":
                      args.begin_ids.append(l2i[label])
      
          args.l2i = l2i
      
          args.labels_num = len(l2i)
      
          # Tokenizer that converts space-separated text into token ids.
          args.tokenizer = SpaceTokenizer(args)
      
          # Build sequence labeling model.
          print(args)
          model = NerTagger(args)
      
          # Load or initialize parameters.
          load_or_initialize_parameters(args, model)
      
          args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          model = model.to(args.device)
      
          # Training phase.
          instances = read_medical_dataset(args, args.train_path)
      
          src = torch.LongTensor([ins[0] for ins in instances])
          tgt = torch.LongTensor([ins[1] for ins in instances])
          seg = torch.LongTensor([ins[2] for ins in instances])
      
          instances_num = src.size(0)
          batch_size = args.batch_size
          args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1
      
          print("Batch size: ", batch_size)
          print("The number of training instances:", instances_num)
      
          optimizer, scheduler = build_optimizer(args, model)
      
          if args.fp16:
              try:
                  from apex import amp
              except ImportError:
                  raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
              model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
      
          if torch.cuda.device_count() > 1:
              print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
              model = torch.nn.DataParallel(model)
          args.model = model
      
          total_loss, f1, best_f1 = 0.0, 0.0, 0.0
      
          print("Start training.")
      
          for epoch in range(1, args.epochs_num + 1):
              model.train()
              src_batch, tgt_batch, seg_batch = [], [], []
              for i, (src_batch, tgt_batch, seg_batch) in enumerate(batch_loader(batch_size, src, tgt, seg)):
                  loss = train(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch)
                  total_loss += loss.item()
                  if (i + 1) % args.report_steps == 0:
                      print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1,
                                                                                        total_loss / args.report_steps))
                      total_loss = 0.0
      
              f1 = evaluate(args, read_medical_dataset(args, args.dev_path))
              if f1 > best_f1:
                  best_f1 = f1
      
                  torch.save(model.state_dict(), args.output_model_path)
                  if not os.path.exists(args.model_label):
                      os.mkdir(args.model_label)
                  torch.onnx.export(model, (src_batch.to(args.device), tgt_batch.to(args.device), seg_batch.to(args.device)),
                                    os.path.join(args.model_label, "uer_py_ner_part" + str(best_f1) + ".onnx"), verbose=True)
      
      
              else:
                  continue
      
          # Evaluation phase.
          if args.test_path is not None:
              print("Test set evaluation.")
              if torch.cuda.device_count() > 1:
                  args.model.module.load_state_dict(torch.load(args.output_model_path))
              else:
                  args.model.load_state_dict(torch.load(args.output_model_path))
              evaluate(args, read_medical_dataset(args, args.test_path))
      
      
      if __name__ == "__main__":
          main()
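
      For reference, read_medical_dataset above expects --train_path to point to a JSON list of examples, each with "tokens" and "ner_tags" fields. A minimal illustrative writer for such a file; the tokens and tags below are made up, and real tags must come from the label2id mapping above (labels ending in "B" mark entity beginnings):

      import json

      # Hypothetical example of the medical_ner.json structure inferred from read_medical_dataset.
      examples = [
          {
              "tokens": ["患", "者", "有", "高", "血", "压"],
              "ner_tags": ["O", "O", "O", "DiseaseB", "Disease", "Disease"],
          }
      ]
      with open("data/medical_ner.json", "w", encoding="utf-8") as f:
          json.dump(examples, f, ensure_ascii=False, indent=2)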
      
      

      Launch commands
      Training the tiny NER model

      python finetune/run_ner_medical.py --train_path data/medical_ner.json --dev_path data/medical_ner.json  --output_model_path models/part.bin --label2id_path data/medical_label2id.json --vocab_path models/google_zh_vocab.txt --batch_size 64 --epochs_num 300 --config_path models/bert/tiny_config.json --model_label onnx_tiny
      

      Training the mini NER model

      python finetune/run_ner_medical.py --train_path data/medical_ner.json --dev_path data/medical_ner.json  --output_model_path models/part.bin --label2id_path data/medical_label2id.json --vocab_path models/google_zh_vocab.txt --batch_size 16 --epochs_num 300 --config_path models/bert/mini_config.json --model_label onnx_mini
      

      Training the small NER model

      python finetune/run_ner_medical.py --train_path data/medical_ner.json --dev_path data/medical_ner.json  --output_model_path models/part.bin --label2id_path data/medical_label2id.json --vocab_path models/google_zh_vocab.txt --batch_size 16 --epochs_num 300 --config_path models/bert/small_config.json --model_label onnx_small
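
      After each improvement in dev F1, the script also exports the model to ONNX (see the torch.onnx.export call above). A minimal inference sketch, assuming onnxruntime is installed; the file path is an example following the export call ("uer_py_ner_part" + best F1 + ".onnx" inside the --model_label directory), and the input names are read from the session rather than assumed:

      import numpy as np
      import onnxruntime as ort

      # Substitute the file actually written during training.
      session = ort.InferenceSession("onnx_tiny/uer_py_ner_part0.85.onnx")

      seq_length = 512
      src = np.zeros((1, seq_length), dtype=np.int64)  # token ids, padded to seq_length
      tgt = np.zeros((1, seq_length), dtype=np.int64)  # dummy labels (forward() expects a tgt argument)
      seg = np.zeros((1, seq_length), dtype=np.int64)  # 1 for real tokens, 0 for padding

      # Feed inputs by whatever names torch.onnx.export assigned to them.
      input_names = [i.name for i in session.get_inputs()]
      outputs = session.run(None, dict(zip(input_names, [src, tgt, seg])))
      print([o.shape for o in outputs])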
      